diff mbox

[FFmpeg-devel,1/3] lavf/f_select: support scenecut with more pixel formats

Message ID 1563231238-20966-1-git-send-email-lance.lmwang@gmail.com
State Superseded
Headers show

Commit Message

Lance Wang July 15, 2019, 10:53 p.m. UTC
From: Limin Wang <lance.lmwang@gmail.com>

This patch does not make the other pixel formats usable yet, to make sure the
test results stay the same as with the rgb format.

Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
---
 libavfilter/f_select.c | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

Comments

Marton Balint July 16, 2019, 10:58 p.m. UTC | #1
On Tue, 16 Jul 2019, lance.lmwang@gmail.com wrote:

> From: Limin Wang <lance.lmwang@gmail.com>
>
> This patch haven't make other pixel format usable yet to make sure the test
> result is same with rgb format.
>
> Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
> ---
> libavfilter/f_select.c | 34 ++++++++++++++++++++++++++++++----
> 1 file changed, 30 insertions(+), 4 deletions(-)
>
> diff --git a/libavfilter/f_select.c b/libavfilter/f_select.c
> index 1132375..eed8df3 100644
> --- a/libavfilter/f_select.c
> +++ b/libavfilter/f_select.c
> @@ -28,6 +28,8 @@
> #include "libavutil/fifo.h"
> #include "libavutil/internal.h"
> #include "libavutil/opt.h"
> +#include "libavutil/imgutils.h"
> +#include "libavutil/pixdesc.h"
> #include "avfilter.h"
> #include "audio.h"
> #include "formats.h"
> @@ -144,6 +146,10 @@ typedef struct SelectContext {
>     char *expr_str;
>     AVExpr *expr;
>     double var_values[VAR_VARS_NB];
> +    int bitdepth;
> +    int nb_planes;
> +    ptrdiff_t width[4];
> +    ptrdiff_t height[4];
>     int do_scene_detect;            ///< 1 if the expression requires scene detection variables, 0 otherwise
>     ff_scene_sad_fn sad;            ///< Sum of the absolute difference function (scene detect only)
>     double prev_mafd;               ///< previous MAFD                           (scene detect only)
> @@ -202,6 +208,17 @@ static av_cold int init(AVFilterContext *ctx)
> static int config_input(AVFilterLink *inlink)
> {
>     SelectContext *select = inlink->dst->priv;
> +    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
> +
> +    select->bitdepth = desc->comp[0].depth;
> +    select->nb_planes = av_pix_fmt_count_planes(inlink->format);
> +    for (int plane = 0; plane < select->nb_planes; plane++) {
> +        ptrdiff_t line_size = av_image_get_linesize(inlink->format, inlink->w, plane);
> +        int vsub = desc->log2_chroma_h;
> +
> +        select->width[plane] = line_size >> (select->bitdepth > 8);
> +        select->height[plane] = plane == 1 || plane == 2 ?  AV_CEIL_RSHIFT(inlink->h, vsub) : inlink->h;
> +    }
>
>     select->var_values[VAR_N]          = 0.0;
>     select->var_values[VAR_SELECTED_N] = 0.0;
> @@ -242,7 +259,7 @@ static int config_input(AVFilterLink *inlink)
>         inlink->type == AVMEDIA_TYPE_AUDIO ? inlink->sample_rate : NAN;
>
>     if (CONFIG_SELECT_FILTER && select->do_scene_detect) {
> -        select->sad = ff_scene_sad_get_fn(8);
> +        select->sad = ff_scene_sad_get_fn(select->bitdepth == 8 ? 8 : 16);
>         if (!select->sad)
>             return AVERROR(EINVAL);
>     }
> @@ -258,12 +275,21 @@ static double get_scene_score(AVFilterContext *ctx, AVFrame *frame)
>     if (prev_picref &&
>         frame->height == prev_picref->height &&
>         frame->width  == prev_picref->width) {
> -        uint64_t sad;
> +        uint64_t sad = 0;
>         double mafd, diff;
> +        int count = 0;
> +
> +        for (int plane = 0; plane < select->nb_planes; plane++) {
> +            uint64_t plane_sad;
> +            select->sad(prev_picref->data[plane], prev_picref->linesize[plane],
> +                    frame->data[plane], frame->linesize[plane],
> +                    select->width[plane], select->height[plane], &plane_sad);
> +            sad += plane_sad;
> +            count += select->width[plane] * select->height[plane];
> +        }
> 
> -        select->sad(prev_picref->data[0], prev_picref->linesize[0], frame->data[0], frame->linesize[0], frame->width * 3, frame->height, &sad);
>         emms_c();
> -        mafd = (double)sad / (frame->width * 3 * frame->height);
> +        mafd = (double)sad / count;

mafd was in [0..255] before you added >8 bitdepth support. For >8 bit you
have to normalize it to this range because the metric should be bitdepth
invariant.

Regards,
Marton

>         diff = fabs(mafd - select->prev_mafd);
>         ret  = av_clipf(FFMIN(mafd, diff) / 100., 0, 1);
>         select->prev_mafd = mafd;
> -- 
> 2.6.4
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Lance Wang July 17, 2019, 1 a.m. UTC | #2
On Wed, Jul 17, 2019 at 12:58:04AM +0200, Marton Balint wrote:
> 
> 
> On Tue, 16 Jul 2019, lance.lmwang@gmail.com wrote:
> 
> >From: Limin Wang <lance.lmwang@gmail.com>
> >
> >This patch haven't make other pixel format usable yet to make sure the test
> >result is same with rgb format.
> >
> >Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
> >---
> >libavfilter/f_select.c | 34 ++++++++++++++++++++++++++++++----
> >1 file changed, 30 insertions(+), 4 deletions(-)
> >
> >diff --git a/libavfilter/f_select.c b/libavfilter/f_select.c
> >index 1132375..eed8df3 100644
> >--- a/libavfilter/f_select.c
> >+++ b/libavfilter/f_select.c
> >@@ -28,6 +28,8 @@
> >#include "libavutil/fifo.h"
> >#include "libavutil/internal.h"
> >#include "libavutil/opt.h"
> >+#include "libavutil/imgutils.h"
> >+#include "libavutil/pixdesc.h"
> >#include "avfilter.h"
> >#include "audio.h"
> >#include "formats.h"
> >@@ -144,6 +146,10 @@ typedef struct SelectContext {
> >    char *expr_str;
> >    AVExpr *expr;
> >    double var_values[VAR_VARS_NB];
> >+    int bitdepth;
> >+    int nb_planes;
> >+    ptrdiff_t width[4];
> >+    ptrdiff_t height[4];
> >    int do_scene_detect;            ///< 1 if the expression requires scene detection variables, 0 otherwise
> >    ff_scene_sad_fn sad;            ///< Sum of the absolute difference function (scene detect only)
> >    double prev_mafd;               ///< previous MAFD                           (scene detect only)
> >@@ -202,6 +208,17 @@ static av_cold int init(AVFilterContext *ctx)
> >static int config_input(AVFilterLink *inlink)
> >{
> >    SelectContext *select = inlink->dst->priv;
> >+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
> >+
> >+    select->bitdepth = desc->comp[0].depth;
> >+    select->nb_planes = av_pix_fmt_count_planes(inlink->format);
> >+    for (int plane = 0; plane < select->nb_planes; plane++) {
> >+        ptrdiff_t line_size = av_image_get_linesize(inlink->format, inlink->w, plane);
> >+        int vsub = desc->log2_chroma_h;
> >+
> >+        select->width[plane] = line_size >> (select->bitdepth > 8);
> >+        select->height[plane] = plane == 1 || plane == 2 ?  AV_CEIL_RSHIFT(inlink->h, vsub) : inlink->h;
> >+    }
> >
> >    select->var_values[VAR_N]          = 0.0;
> >    select->var_values[VAR_SELECTED_N] = 0.0;
> >@@ -242,7 +259,7 @@ static int config_input(AVFilterLink *inlink)
> >        inlink->type == AVMEDIA_TYPE_AUDIO ? inlink->sample_rate : NAN;
> >
> >    if (CONFIG_SELECT_FILTER && select->do_scene_detect) {
> >-        select->sad = ff_scene_sad_get_fn(8);
> >+        select->sad = ff_scene_sad_get_fn(select->bitdepth == 8 ? 8 : 16);
> >        if (!select->sad)
> >            return AVERROR(EINVAL);
> >    }
> >@@ -258,12 +275,21 @@ static double get_scene_score(AVFilterContext *ctx, AVFrame *frame)
> >    if (prev_picref &&
> >        frame->height == prev_picref->height &&
> >        frame->width  == prev_picref->width) {
> >-        uint64_t sad;
> >+        uint64_t sad = 0;
> >        double mafd, diff;
> >+        int count = 0;
> >+
> >+        for (int plane = 0; plane < select->nb_planes; plane++) {
> >+            uint64_t plane_sad;
> >+            select->sad(prev_picref->data[plane], prev_picref->linesize[plane],
> >+                    frame->data[plane], frame->linesize[plane],
> >+                    select->width[plane], select->height[plane], &plane_sad);
> >+            sad += plane_sad;
> >+            count += select->width[plane] * select->height[plane];
> >+        }
> >
> >-        select->sad(prev_picref->data[0], prev_picref->linesize[0], frame->data[0], frame->linesize[0], frame->width * 3, frame->height, &sad);
> >        emms_c();
> >-        mafd = (double)sad / (frame->width * 3 * frame->height);
> >+        mafd = (double)sad / count;
> 
> mafd was in [0..255] before you added >8 bitdepth support. For >8
> bit you have to normalize it to this range because the metric should
> be bitdepth invariant.
Yes, I think 100 should be changed to 1ULL << bitdepth. However, changing
it would make the threshold very different from the old behaviour, so it is
not yet clear to me how to change it while staying consistent with the old one.

I'll try to get an HDR 10-bit sample to test 10-bit input; I guess my current
10-bit sample isn't real 10-bit.


> 
> Regards,
> Marton
> 
> >        diff = fabs(mafd - select->prev_mafd);
> >        ret  = av_clipf(FFMIN(mafd, diff) / 100., 0, 1);
> >        select->prev_mafd = mafd;
> >-- 
> >2.6.4
> >
> >_______________________________________________
> >ffmpeg-devel mailing list
> >ffmpeg-devel@ffmpeg.org
> >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> >To unsubscribe, visit link above, or email
> >ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Lance Wang July 17, 2019, 11:47 a.m. UTC | #3
On Wed, Jul 17, 2019 at 12:58:04AM +0200, Marton Balint wrote:
> 
> 
> On Tue, 16 Jul 2019, lance.lmwang@gmail.com wrote:
> 
> >From: Limin Wang <lance.lmwang@gmail.com>
> >
> >This patch haven't make other pixel format usable yet to make sure the test
> >result is same with rgb format.
> >
> >Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
> >---
> >libavfilter/f_select.c | 34 ++++++++++++++++++++++++++++++----
> >1 file changed, 30 insertions(+), 4 deletions(-)
> >
> >diff --git a/libavfilter/f_select.c b/libavfilter/f_select.c
> >index 1132375..eed8df3 100644
> >--- a/libavfilter/f_select.c
> >+++ b/libavfilter/f_select.c
> >@@ -28,6 +28,8 @@
> >#include "libavutil/fifo.h"
> >#include "libavutil/internal.h"
> >#include "libavutil/opt.h"
> >+#include "libavutil/imgutils.h"
> >+#include "libavutil/pixdesc.h"
> >#include "avfilter.h"
> >#include "audio.h"
> >#include "formats.h"
> >@@ -144,6 +146,10 @@ typedef struct SelectContext {
> >    char *expr_str;
> >    AVExpr *expr;
> >    double var_values[VAR_VARS_NB];
> >+    int bitdepth;
> >+    int nb_planes;
> >+    ptrdiff_t width[4];
> >+    ptrdiff_t height[4];
> >    int do_scene_detect;            ///< 1 if the expression requires scene detection variables, 0 otherwise
> >    ff_scene_sad_fn sad;            ///< Sum of the absolute difference function (scene detect only)
> >    double prev_mafd;               ///< previous MAFD                           (scene detect only)
> >@@ -202,6 +208,17 @@ static av_cold int init(AVFilterContext *ctx)
> >static int config_input(AVFilterLink *inlink)
> >{
> >    SelectContext *select = inlink->dst->priv;
> >+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
> >+
> >+    select->bitdepth = desc->comp[0].depth;
> >+    select->nb_planes = av_pix_fmt_count_planes(inlink->format);
> >+    for (int plane = 0; plane < select->nb_planes; plane++) {
> >+        ptrdiff_t line_size = av_image_get_linesize(inlink->format, inlink->w, plane);
> >+        int vsub = desc->log2_chroma_h;
> >+
> >+        select->width[plane] = line_size >> (select->bitdepth > 8);
> >+        select->height[plane] = plane == 1 || plane == 2 ?  AV_CEIL_RSHIFT(inlink->h, vsub) : inlink->h;
> >+    }
> >
> >    select->var_values[VAR_N]          = 0.0;
> >    select->var_values[VAR_SELECTED_N] = 0.0;
> >@@ -242,7 +259,7 @@ static int config_input(AVFilterLink *inlink)
> >        inlink->type == AVMEDIA_TYPE_AUDIO ? inlink->sample_rate : NAN;
> >
> >    if (CONFIG_SELECT_FILTER && select->do_scene_detect) {
> >-        select->sad = ff_scene_sad_get_fn(8);
> >+        select->sad = ff_scene_sad_get_fn(select->bitdepth == 8 ? 8 : 16);
> >        if (!select->sad)
> >            return AVERROR(EINVAL);
> >    }
> >@@ -258,12 +275,21 @@ static double get_scene_score(AVFilterContext *ctx, AVFrame *frame)
> >    if (prev_picref &&
> >        frame->height == prev_picref->height &&
> >        frame->width  == prev_picref->width) {
> >-        uint64_t sad;
> >+        uint64_t sad = 0;
> >        double mafd, diff;
> >+        int count = 0;
> >+
> >+        for (int plane = 0; plane < select->nb_planes; plane++) {
> >+            uint64_t plane_sad;
> >+            select->sad(prev_picref->data[plane], prev_picref->linesize[plane],
> >+                    frame->data[plane], frame->linesize[plane],
> >+                    select->width[plane], select->height[plane], &plane_sad);
> >+            sad += plane_sad;
> >+            count += select->width[plane] * select->height[plane];
> >+        }
> >
> >-        select->sad(prev_picref->data[0], prev_picref->linesize[0], frame->data[0], frame->linesize[0], frame->width * 3, frame->height, &sad);
> >        emms_c();
> >-        mafd = (double)sad / (frame->width * 3 * frame->height);
> >+        mafd = (double)sad / count;
> 
> mafd was in [0..255] before you added >8 bitdepth support. For >8
> bit you have to normalize it to this range because the metric should
> be bitdepth invariant.

I have redone the testing with a 10-bit sample, downloaded from here:
http://samples.ffmpeg.org/4khdr/Passengers_Breakfast_4K.mkv

1. Tested with master without any change: 5 scene cuts detected

$./ffmpeg -y -i ~/Movies/Passengers_Breakfast_4K.mkv   -vf select='gt(scene\,0.25),showinfo,scale=320x240,tile'  -frames 1 mosaic.png 
[Parsed_showinfo_1 @ 0x7ff92ce91dc0] n:   0 pts:   3587 pts_time:3.587 pos: 20598502 fmt:rgb24 sar:1/1 s:3840x2160 i:P iskey:1 type:I checksum:08D7000B plane_checksum:[08D7000B] mean:[65] stdev:[53.5]
[Parsed_showinfo_1 @ 0x7ff92ce91dc0] n:   1 pts:  16225 pts_time:16.225 pos: 90031475 fmt:rgb24 sar:1/1 s:3840x2160 i:P iskey:1 type:I checksum:9BEE9923 plane_checksum:[9BEE9923] mean:[81] stdev:[53.7]
[Parsed_showinfo_1 @ 0x7ff92ce91dc0] n:   2 pts:  19645 pts_time:19.645 pos:106768564 fmt:rgb24 sar:1/1 s:3840x2160 i:P iskey:1 type:I checksum:A2B439F8 plane_checksum:[A2B439F8] mean:[69] stdev:[52.9]
[Parsed_showinfo_1 @ 0x7ff92ce91dc0] n:   3 pts:  22314 pts_time:22.314 pos:121715002 fmt:rgb24 sar:1/1 s:3840x2160 i:P iskey:1 type:I checksum:1E5606B6 plane_checksum:[1E5606B6] mean:[81] stdev:[53.5]

2. Tested with the patch applied on master (using 100 for the normalization):
11 scene cuts detected

$./ffmpeg -y -i ~/Movies/Passengers_Breakfast_4K.mkv   -vf select='gt(scene\,0.25),showinfo,scale=320x240,tile'  -frames 1 mosaic.png 
[Parsed_showinfo_1 @ 0x7ff6f7d0bc00] n:   0 pts:   3587 pts_time:3.587 pos: 20598502 fmt:yuv420p10le sar:1/1 s:3840x2160 i:P iskey:1 type:I checksum:05C7AB58 plane_checksum:[176E5F9E 502F52BC CA49F8EF] mean:[58 45 43] stdev:[74.7 89.7 90.2]
[Parsed_showinfo_1 @ 0x7ff6f7d0bc00] n:   1 pts:   4838 pts_time:4.838 pos: 28136867 fmt:yuv420p10le sar:1/1 s:3840x2160 i:P iskey:1 type:I checksum:961DC284 plane_checksum:[609A150C 2262DD54 BCDDD015] mean:[55 41 55] stdev:[71.2 85.8 100.2]
[Parsed_showinfo_1 @ 0x7ff6f7d0bc00] n:   2 pts:   6548 pts_time:6.548 pos: 38226025 fmt:yuv420p10le sar:1/1 s:3840x2160 i:P iskey:1 type:I checksum:D1576867 plane_checksum:[0C3D1DD3 DB07BCB1 6B3E8DD4] mean:[60 43 49] stdev:[76.7 87.8 95.4]
[Parsed_showinfo_1 @ 0x7ff6f7d0bc00] n:   3 pts:   9969 pts_time:9.969 pos: 58231491 fmt:yuv420p10le sar:1/1 s:3840x2160 i:P iskey:1 type:I checksum:39AA967A plane_checksum:[0F7D99F7 08590047 2AF5FC2D] mean:[59 43 42] stdev:[77.6 89.1 89.2]
[Parsed_showinfo_1 @ 0x7ff6f7d0bc00] n:   4 pts:  12471 pts_time:12.471 pos: 69526490 fmt:yuv420p10le sar:1/1 s:3840x2160 i:P iskey:1 type:I checksum:D615B095 plane_checksum:[F4792F0E 450BDB7E 3BE6A5FA] mean:[59 38 53] stdev:[76.6 82.4 98.8]
[Parsed_showinfo_1 @ 0x7ff6f7d0bc00] n:   5 pts:  16225 pts_time:16.225 pos: 90031475 fmt:yuv420p10le sar:1/1 s:3840x2160 i:P iskey:1 type:I checksum:B420C139 plane_checksum:[80235567 B28BFCF6 BF026ECD] mean:[57 25 71] stdev:[78.3 65.7 110.1]
[Parsed_showinfo_1 @ 0x7ff6f7d0bc00] n:   6 pts:  19645 pts_time:19.645 pos:106768564 fmt:yuv420p10le sar:1/1 s:3840x2160 i:P iskey:1 type:I checksum:DC9A15E4 plane_checksum:[586501E3 EBBD5E2D FBDEB5C5] mean:[61 40 48] stdev:[77.2 84.6 94.2]
[Parsed_showinfo_1 @ 0x7ff6f7d0bc00] n:   7 pts:  22314 pts_time:22.314 pos:121715002 fmt:yuv420p10le sar:1/1 s:3840x2160 i:P iskey:1 type:I checksum:0B76FC64 plane_checksum:[2F1184E4 486CE1E0 E7B99591] mean:[58 26 70] stdev:[78.1 67.7 109.7]
[Parsed_showinfo_1 @ 0x7ff6f7d0bc00] n:   8 pts:  24817 pts_time:24.817 pos:133677324 fmt:yuv420p10le sar:1/1 s:3840x2160 i:P iskey:1 type:I checksum:5994D7F3 plane_checksum:[9FAA00A6 E4CBBBD9 F05C1B74] mean:[59 32 61] stdev:[75.4 74.4 103.9]
[Parsed_showinfo_1 @ 0x7ff6f7d0bc00] n:   9 pts:  25651 pts_time:25.651 pos:139418957 fmt:yuv420p10le sar:1/1 s:3840x2160 i:P iskey:1 type:I checksum:9BC5D77A plane_checksum:[9158C550 222A8750 027C8ACB] mean:[59 25 73] stdev:[79.7 64.9 111.3]
[Parsed_showinfo_1 @ 0x7ff6f7d0bc00] n:  10 pts:  26652 pts_time:26.652 pos:144476906 fmt:yuv420p10le sar:1/1 s:3840x2160 i:P iskey:1 type:I checksum:A91EF909 plane_checksum:[78B2A2D9 E82A9A8D 753BBB94] mean:[55 19 54] stdev:[73.8 55.4 101.0] 

3. Tested with hardware decoding to p010le: the result isn't right because of
the normalization issue

$ ./ffmpeg -y -hwaccel videotoolbox  -i ~/Movies/Passengers_Breakfast_4K.mkv   -vf select='gt(scene\,0.25),showinfo,scale=320x240,tile'  -frames 1 samsung.png


4. Tested with hardware decoding and with the 100 changed to 1ULL << bitdepth for the normalization: 13 scene cuts detected

$ ./ffmpeg -y -hwaccel videotoolbox  -i ~/Movies/Passengers_Breakfast_4K.mkv   -vf select='gt(scene\,0.25),showinfo,scale=320x240,tile'  -frames 1 samsung.png


I have tested other samples and see the same behavior. For software
decoding to yuv420p10, the normalization doesn't work as expected.
Why is it not real 10-bit data?


> Regards,
> Marton
> 
> >        diff = fabs(mafd - select->prev_mafd);
> >        ret  = av_clipf(FFMIN(mafd, diff) / 100., 0, 1);
> >        select->prev_mafd = mafd;
> >-- 
> >2.6.4
> >
> >_______________________________________________
> >ffmpeg-devel mailing list
> >ffmpeg-devel@ffmpeg.org
> >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> >To unsubscribe, visit link above, or email
> >ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
diff mbox

Patch

diff --git a/libavfilter/f_select.c b/libavfilter/f_select.c
index 1132375..eed8df3 100644
--- a/libavfilter/f_select.c
+++ b/libavfilter/f_select.c
@@ -28,6 +28,8 @@ 
 #include "libavutil/fifo.h"
 #include "libavutil/internal.h"
 #include "libavutil/opt.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/pixdesc.h"
 #include "avfilter.h"
 #include "audio.h"
 #include "formats.h"
@@ -144,6 +146,10 @@  typedef struct SelectContext {
     char *expr_str;
     AVExpr *expr;
     double var_values[VAR_VARS_NB];
+    int bitdepth;
+    int nb_planes;
+    ptrdiff_t width[4];
+    ptrdiff_t height[4];
     int do_scene_detect;            ///< 1 if the expression requires scene detection variables, 0 otherwise
     ff_scene_sad_fn sad;            ///< Sum of the absolute difference function (scene detect only)
     double prev_mafd;               ///< previous MAFD                           (scene detect only)
@@ -202,6 +208,17 @@  static av_cold int init(AVFilterContext *ctx)
 static int config_input(AVFilterLink *inlink)
 {
     SelectContext *select = inlink->dst->priv;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
+
+    select->bitdepth = desc->comp[0].depth;
+    select->nb_planes = av_pix_fmt_count_planes(inlink->format);
+    for (int plane = 0; plane < select->nb_planes; plane++) {
+        ptrdiff_t line_size = av_image_get_linesize(inlink->format, inlink->w, plane);
+        int vsub = desc->log2_chroma_h;
+
+        select->width[plane] = line_size >> (select->bitdepth > 8);
+        select->height[plane] = plane == 1 || plane == 2 ?  AV_CEIL_RSHIFT(inlink->h, vsub) : inlink->h;
+    }
 
     select->var_values[VAR_N]          = 0.0;
     select->var_values[VAR_SELECTED_N] = 0.0;
@@ -242,7 +259,7 @@  static int config_input(AVFilterLink *inlink)
         inlink->type == AVMEDIA_TYPE_AUDIO ? inlink->sample_rate : NAN;
 
     if (CONFIG_SELECT_FILTER && select->do_scene_detect) {
-        select->sad = ff_scene_sad_get_fn(8);
+        select->sad = ff_scene_sad_get_fn(select->bitdepth == 8 ? 8 : 16);
         if (!select->sad)
             return AVERROR(EINVAL);
     }
@@ -258,12 +275,21 @@  static double get_scene_score(AVFilterContext *ctx, AVFrame *frame)
     if (prev_picref &&
         frame->height == prev_picref->height &&
         frame->width  == prev_picref->width) {
-        uint64_t sad;
+        uint64_t sad = 0;
         double mafd, diff;
+        int count = 0;
+
+        for (int plane = 0; plane < select->nb_planes; plane++) {
+            uint64_t plane_sad;
+            select->sad(prev_picref->data[plane], prev_picref->linesize[plane],
+                    frame->data[plane], frame->linesize[plane],
+                    select->width[plane], select->height[plane], &plane_sad);
+            sad += plane_sad;
+            count += select->width[plane] * select->height[plane];
+        }
 
-        select->sad(prev_picref->data[0], prev_picref->linesize[0], frame->data[0], frame->linesize[0], frame->width * 3, frame->height, &sad);
         emms_c();
-        mafd = (double)sad / (frame->width * 3 * frame->height);
+        mafd = (double)sad / count;
         diff = fabs(mafd - select->prev_mafd);
         ret  = av_clipf(FFMIN(mafd, diff) / 100., 0, 1);
         select->prev_mafd = mafd;