diff mbox series

[FFmpeg-devel] avfilter/select: evaluate silencedetect metadata

Message ID 20210617202304.15639-1-timo@rothenpieler.org
State New
Headers show
Series [FFmpeg-devel] avfilter/select: evaluate silencedetect metadata | expand

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Timo Rothenpieler June 17, 2021, 8:23 p.m. UTC
---
 doc/filters.texi       |  6 ++++++
 libavfilter/f_select.c | 18 ++++++++++++++++++
 libavfilter/version.h  |  2 +-
 3 files changed, 25 insertions(+), 1 deletion(-)

Comments

Gyan Doshi June 18, 2021, 4:19 a.m. UTC | #1
On 2021-06-18 01:53, Timo Rothenpieler wrote:
> ---
>   doc/filters.texi       |  6 ++++++
>   libavfilter/f_select.c | 18 ++++++++++++++++++
>   libavfilter/version.h  |  2 +-
>   3 files changed, 25 insertions(+), 1 deletion(-)
>
> diff --git a/doc/filters.texi b/doc/filters.texi
> index da8f7d7726..db6ecd7c2a 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -5404,6 +5404,7 @@ Set sidechain gain. Default is 1. Range is from 0.015625 to 64.
>   
>   This filter supports the all above options as @ref{commands}.
>   
> +@anchor{silencedetect}
>   @section silencedetect
>   
>   Detect silence in an audio stream.
> @@ -25564,6 +25565,11 @@ missing.
>   That basically means that an input frame is selected if its pts is within the
>   interval set by the concat demuxer.
>   
> +@item silence_detected
> +Evaluates the metadata added to frames by the @ref{silencedetect} filter.
> +The @var{silence_detected} variable is -1 if silence was detected for the current
> +frame, 0 otherwise.
> +

Instead of a specific option for silencedetect, it would be future-proof 
if it was an option called, say, metadata with a constant for 
silencedetect to start with.
There are multiple per-frame analysis filters like blackframe, 
blackdetect, freezedetect..etc and it will be easier to just extend 
'metadata' with new constants
than to add a new option for each filter.

>   @end table
>   
>   The default value of the select expression is "1".
> diff --git a/libavfilter/f_select.c b/libavfilter/f_select.c
> index f0468078e8..1e655290f4 100644
> --- a/libavfilter/f_select.c
> +++ b/libavfilter/f_select.c
> @@ -86,6 +86,8 @@ static const char *const var_names[] = {
>   
>       "concatdec_select",  ///< frame is within the interval set by the concat demuxer
>   
> +    "silence_detected",  ///< silencedetect detected silence for this frame
> +
>       NULL
>   };
>   
> @@ -138,6 +140,8 @@ enum var_name {
>   
>       VAR_CONCATDEC_SELECT,
>   
> +    VAR_SILENCE_DETECTED,
> +
>       VAR_VARS_NB
>   };
>   
> @@ -157,6 +161,7 @@ typedef struct SelectContext {
>       double select;
>       int select_out;                 ///< mark the selected output pad index
>       int nb_outputs;
> +    int silence_detected;
>   } SelectContext;
>   
>   #define OFFSET(x) offsetof(SelectContext, x)
> @@ -325,6 +330,18 @@ static double get_concatdec_select(AVFrame *frame, int64_t pts)
>       return NAN;
>   }
>   
> +static double get_silence_detected(SelectContext *select, AVFrame *frame)
> +{
> +    AVDictionary *metadata = frame->metadata;
> +    AVDictionaryEntry *start = av_dict_get(metadata, "lavfi.silence_start", NULL, 0);
> +    AVDictionaryEntry *end = av_dict_get(metadata, "lavfi.silence_end", NULL, 0);
> +    if (start)
> +        select->silence_detected = -1;
> +    if (end)
> +        select->silence_detected = 0;
> +    return select->silence_detected;
> +}
> +
>   static void select_frame(AVFilterContext *ctx, AVFrame *frame)
>   {
>       SelectContext *select = ctx->priv;
> @@ -342,6 +359,7 @@ static void select_frame(AVFilterContext *ctx, AVFrame *frame)
>       select->var_values[VAR_POS] = frame->pkt_pos == -1 ? NAN : frame->pkt_pos;
>       select->var_values[VAR_KEY] = frame->key_frame;
>       select->var_values[VAR_CONCATDEC_SELECT] = get_concatdec_select(frame, av_rescale_q(frame->pts, inlink->time_base, AV_TIME_BASE_Q));
> +    select->var_values[VAR_SILENCE_DETECTED] = get_silence_detected(select, frame);
>   
>       switch (inlink->type) {
>       case AVMEDIA_TYPE_AUDIO:
> diff --git a/libavfilter/version.h b/libavfilter/version.h
> index 5052681653..fbb81ef31c 100644
> --- a/libavfilter/version.h
> +++ b/libavfilter/version.h
> @@ -31,7 +31,7 @@
>   
>   #define LIBAVFILTER_VERSION_MAJOR   8
>   #define LIBAVFILTER_VERSION_MINOR   0
> -#define LIBAVFILTER_VERSION_MICRO 102
> +#define LIBAVFILTER_VERSION_MICRO 103
>   
>   
>   #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
Timo Rothenpieler June 18, 2021, 11:32 a.m. UTC | #2
On 18.06.2021 06:19, Gyan Doshi wrote:
> Instead of a specific option for silencedetect, it would be future-proof 
> if it was an option called, say, metadata with a constant for 
> silencedetect to start with.
> There are multiple per-frame analysis filters like blackframe, 
> blackdetect, freezedetect..etc and it will be easier to just extend 
> 'metadata' with new constants
> than to add a new option for each filter.

The only issue I see with that is a loss of flexibility.
I does not seem like you can pass parameters to variables in the 
expression syntax.
So there would need to be two new parameters for the select filter itself.
The start and the end marker metadata name.

But what if a user wanted to combine multiple detection filters?
I guess chaining multiple select filters would work, but can be pretty 
annoying, depending on the usecase.
Gyan Doshi June 18, 2021, 11:46 a.m. UTC | #3
On 2021-06-18 17:02, Timo Rothenpieler wrote:
> On 18.06.2021 06:19, Gyan Doshi wrote:
>> Instead of a specific option for silencedetect, it would be 
>> future-proof if it was an option called, say, metadata with a 
>> constant for silencedetect to start with.
>> There are multiple per-frame analysis filters like blackframe, 
>> blackdetect, freezedetect..etc and it will be easier to just extend 
>> 'metadata' with new constants
>> than to add a new option for each filter.
>
> The only issue I see with that is a loss of flexibility.
> I does not seem like you can pass parameters to variables in the 
> expression syntax.
> So there would need to be two new parameters for the select filter 
> itself.
> The start and the end marker metadata name.
>
> But what if a user wanted to combine multiple detection filters?
> I guess chaining multiple select filters would work, but can be pretty 
> annoying, depending on the usecase.
>
Yes, ideally, we would associate a function for metadata that allows it 
to be used flexibly within the expr.

See how drawtext text parameter can accommodate various functions. 
There's a limited example (last one) at 
http://www.ffmpeg.org/ffmpeg-filters.html#Examples-62

Regards,
Gyan
Timo Rothenpieler June 18, 2021, 11:57 a.m. UTC | #4
On 18.06.2021 13:46, Gyan Doshi wrote:
> 
> 
> On 2021-06-18 17:02, Timo Rothenpieler wrote:
>> On 18.06.2021 06:19, Gyan Doshi wrote:
>>> Instead of a specific option for silencedetect, it would be 
>>> future-proof if it was an option called, say, metadata with a 
>>> constant for silencedetect to start with.
>>> There are multiple per-frame analysis filters like blackframe, 
>>> blackdetect, freezedetect..etc and it will be easier to just extend 
>>> 'metadata' with new constants
>>> than to add a new option for each filter.
>>
>> The only issue I see with that is a loss of flexibility.
>> I does not seem like you can pass parameters to variables in the 
>> expression syntax.
>> So there would need to be two new parameters for the select filter 
>> itself.
>> The start and the end marker metadata name.
>>
>> But what if a user wanted to combine multiple detection filters?
>> I guess chaining multiple select filters would work, but can be pretty 
>> annoying, depending on the usecase.
>>
> Yes, ideally, we would associate a function for metadata that allows it 
> to be used flexibly within the expr.
> 
> See how drawtext text parameter can accommodate various functions. 
> There's a limited example (last one) at 
> http://www.ffmpeg.org/ffmpeg-filters.html#Examples-62

The drawtext filter does not use expression evaluation for its text 
parameter.
It implements its own logic for that, and it's purely text-replace.

The expression parser does support functions, but only functions with 
one or two numeric arguments.
So it'll have to be something like detected(silence) and then silence is 
a constant numeric variable that the detected function resolves to the 
specific metadata names.
Gyan Doshi June 18, 2021, 12:35 p.m. UTC | #5
On 2021-06-18 17:27, Timo Rothenpieler wrote:
> On 18.06.2021 13:46, Gyan Doshi wrote:
>>
>>
>> On 2021-06-18 17:02, Timo Rothenpieler wrote:
>>> On 18.06.2021 06:19, Gyan Doshi wrote:
>>>> Instead of a specific option for silencedetect, it would be 
>>>> future-proof if it was an option called, say, metadata with a 
>>>> constant for silencedetect to start with.
>>>> There are multiple per-frame analysis filters like blackframe, 
>>>> blackdetect, freezedetect..etc and it will be easier to just extend 
>>>> 'metadata' with new constants
>>>> than to add a new option for each filter.
>>>
>>> The only issue I see with that is a loss of flexibility.
>>> I does not seem like you can pass parameters to variables in the 
>>> expression syntax.
>>> So there would need to be two new parameters for the select filter 
>>> itself.
>>> The start and the end marker metadata name.
>>>
>>> But what if a user wanted to combine multiple detection filters?
>>> I guess chaining multiple select filters would work, but can be 
>>> pretty annoying, depending on the usecase.
>>>
>> Yes, ideally, we would associate a function for metadata that allows 
>> it to be used flexibly within the expr.
>>
>> See how drawtext text parameter can accommodate various functions. 
>> There's a limited example (last one) at 
>> http://www.ffmpeg.org/ffmpeg-filters.html#Examples-62
>
> The drawtext filter does not use expression evaluation for its text 
> parameter.
> It implements its own logic for that, and it's purely text-replace.
>
> The expression parser does support functions, but only functions with 
> one or two numeric arguments.
> So it'll have to be something like detected(silence) and then silence 
> is a constant numeric variable that the detected function resolves to 
> the specific metadata names.

Yeah, so I guess this patch can go ahead as-is, and I'll look into 
extending eval with a  metadata(key)   function for a generic facility.

Regards,
Gyan
Timo Rothenpieler June 18, 2021, 12:38 p.m. UTC | #6
On 18.06.2021 14:35, Gyan Doshi wrote:
>> The drawtext filter does not use expression evaluation for its text 
>> parameter.
>> It implements its own logic for that, and it's purely text-replace.
>>
>> The expression parser does support functions, but only functions with 
>> one or two numeric arguments.
>> So it'll have to be something like detected(silence) and then silence 
>> is a constant numeric variable that the detected function resolves to 
>> the specific metadata names.
> 
> Yeah, so I guess this patch can go ahead as-is, and I'll look into 
> extending eval with a  metadata(key)   function for a generic facility.

I don't think this can be easily added to the expression parser, since 
it needs to keep track of the internal state.
Various kinds of metadatas work very differently as well.

I'm almost finished changing this patch to a more generic way, where new 
detection filters that follow the principle of silencedetect can be 
added easily.
diff mbox series

Patch

diff --git a/doc/filters.texi b/doc/filters.texi
index da8f7d7726..db6ecd7c2a 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -5404,6 +5404,7 @@  Set sidechain gain. Default is 1. Range is from 0.015625 to 64.
 
 This filter supports the all above options as @ref{commands}.
 
+@anchor{silencedetect}
 @section silencedetect
 
 Detect silence in an audio stream.
@@ -25564,6 +25565,11 @@  missing.
 That basically means that an input frame is selected if its pts is within the
 interval set by the concat demuxer.
 
+@item silence_detected
+Evaluates the metadata added to frames by the @ref{silencedetect} filter.
+The @var{silence_detected} variable is -1 if silence was detected for the current
+frame, 0 otherwise.
+
 @end table
 
 The default value of the select expression is "1".
diff --git a/libavfilter/f_select.c b/libavfilter/f_select.c
index f0468078e8..1e655290f4 100644
--- a/libavfilter/f_select.c
+++ b/libavfilter/f_select.c
@@ -86,6 +86,8 @@  static const char *const var_names[] = {
 
     "concatdec_select",  ///< frame is within the interval set by the concat demuxer
 
+    "silence_detected",  ///< silencedetect detected silence for this frame
+
     NULL
 };
 
@@ -138,6 +140,8 @@  enum var_name {
 
     VAR_CONCATDEC_SELECT,
 
+    VAR_SILENCE_DETECTED,
+
     VAR_VARS_NB
 };
 
@@ -157,6 +161,7 @@  typedef struct SelectContext {
     double select;
     int select_out;                 ///< mark the selected output pad index
     int nb_outputs;
+    int silence_detected;
 } SelectContext;
 
 #define OFFSET(x) offsetof(SelectContext, x)
@@ -325,6 +330,18 @@  static double get_concatdec_select(AVFrame *frame, int64_t pts)
     return NAN;
 }
 
+static double get_silence_detected(SelectContext *select, AVFrame *frame)
+{
+    AVDictionary *metadata = frame->metadata;
+    AVDictionaryEntry *start = av_dict_get(metadata, "lavfi.silence_start", NULL, 0);
+    AVDictionaryEntry *end = av_dict_get(metadata, "lavfi.silence_end", NULL, 0);
+    if (start)
+        select->silence_detected = -1;
+    if (end)
+        select->silence_detected = 0;
+    return select->silence_detected;
+}
+
 static void select_frame(AVFilterContext *ctx, AVFrame *frame)
 {
     SelectContext *select = ctx->priv;
@@ -342,6 +359,7 @@  static void select_frame(AVFilterContext *ctx, AVFrame *frame)
     select->var_values[VAR_POS] = frame->pkt_pos == -1 ? NAN : frame->pkt_pos;
     select->var_values[VAR_KEY] = frame->key_frame;
     select->var_values[VAR_CONCATDEC_SELECT] = get_concatdec_select(frame, av_rescale_q(frame->pts, inlink->time_base, AV_TIME_BASE_Q));
+    select->var_values[VAR_SILENCE_DETECTED] = get_silence_detected(select, frame);
 
     switch (inlink->type) {
     case AVMEDIA_TYPE_AUDIO:
diff --git a/libavfilter/version.h b/libavfilter/version.h
index 5052681653..fbb81ef31c 100644
--- a/libavfilter/version.h
+++ b/libavfilter/version.h
@@ -31,7 +31,7 @@ 
 
 #define LIBAVFILTER_VERSION_MAJOR   8
 #define LIBAVFILTER_VERSION_MINOR   0
-#define LIBAVFILTER_VERSION_MICRO 102
+#define LIBAVFILTER_VERSION_MICRO 103
 
 
 #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \