diff mbox

[FFmpeg-devel,1/3] avfilter/vf_framerate: factorize SAD functions which compute SAD for a whole frame

Message ID 20181104120606.4177-1-cus@passwd.hu
State Accepted
Commit 6c2a7a8e9a3698f37913d3f24723fbb8fa895798
Headers show

Commit Message

Marton Balint Nov. 4, 2018, 12:06 p.m. UTC
Also add SIMD which works on lines because it is faster than calculating it on
8x8 blocks using pixelutils.

Signed-off-by: Marton Balint <cus@passwd.hu>
---
 configure                        |  3 +-
 libavfilter/Makefile             |  1 +
 libavfilter/framerate.h          |  4 +--
 libavfilter/scene_sad.c          | 72 ++++++++++++++++++++++++++++++++++++++
 libavfilter/scene_sad.h          | 44 ++++++++++++++++++++++++
 libavfilter/vf_framerate.c       | 61 ++++-----------------------------
 libavfilter/x86/Makefile         |  4 +++
 libavfilter/x86/scene_sad.asm    | 74 ++++++++++++++++++++++++++++++++++++++++
 libavfilter/x86/scene_sad_init.c | 52 ++++++++++++++++++++++++++++
 9 files changed, 257 insertions(+), 58 deletions(-)
 create mode 100644 libavfilter/scene_sad.c
 create mode 100644 libavfilter/scene_sad.h
 create mode 100644 libavfilter/x86/scene_sad.asm
 create mode 100644 libavfilter/x86/scene_sad_init.c

Comments

James Almer Nov. 4, 2018, 1:28 p.m. UTC | #1
On 11/4/2018 9:06 AM, Marton Balint wrote:
> Also add SIMD which works on lines because it is faster then calculating it on
> 8x8 blocks using pixelutils.
> 
> Signed-off-by: Marton Balint <cus@passwd.hu>
> ---
>  configure                        |  3 +-
>  libavfilter/Makefile             |  1 +
>  libavfilter/framerate.h          |  4 +--
>  libavfilter/scene_sad.c          | 72 ++++++++++++++++++++++++++++++++++++++
>  libavfilter/scene_sad.h          | 44 ++++++++++++++++++++++++
>  libavfilter/vf_framerate.c       | 61 ++++-----------------------------
>  libavfilter/x86/Makefile         |  4 +++
>  libavfilter/x86/scene_sad.asm    | 74 ++++++++++++++++++++++++++++++++++++++++
>  libavfilter/x86/scene_sad_init.c | 52 ++++++++++++++++++++++++++++
>  9 files changed, 257 insertions(+), 58 deletions(-)
>  create mode 100644 libavfilter/scene_sad.c
>  create mode 100644 libavfilter/scene_sad.h
>  create mode 100644 libavfilter/x86/scene_sad.asm
>  create mode 100644 libavfilter/x86/scene_sad_init.c
> 
> diff --git a/configure b/configure
> index 2606b885b0..f5bec9fd62 100755
> --- a/configure
> +++ b/configure
> @@ -2335,6 +2335,7 @@ CONFIG_EXTRA="
>      rtpdec
>      rtpenc_chain
>      rv34dsp
> +    scene_sad
>      sinewin
>      snappy
>      srtp
> @@ -3395,7 +3396,7 @@ find_rect_filter_deps="avcodec avformat gpl"
>  firequalizer_filter_deps="avcodec"
>  firequalizer_filter_select="rdft"
>  flite_filter_deps="libflite"
> -framerate_filter_select="pixelutils"
> +framerate_filter_select="scene_sad"
>  frei0r_filter_deps="frei0r libdl"
>  frei0r_src_filter_deps="frei0r libdl"
>  fspp_filter_deps="gpl"
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index 51e48efc2e..390c2b7997 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -28,6 +28,7 @@ OBJS-$(HAVE_THREADS)                         += pthread.o
>  OBJS-$(CONFIG_QSVVPP)                        += qsvvpp.o
>  DNN-OBJS-$(CONFIG_LIBTENSORFLOW)             += dnn_backend_tf.o
>  OBJS-$(CONFIG_DNN)                           += dnn_interface.o dnn_backend_native.o $(DNN-OBJS-yes)
> +OBJS-$(CONFIG_SCENE_SAD)                     += scene_sad.o
>  
>  # audio filters
>  OBJS-$(CONFIG_ABENCH_FILTER)                 += f_bench.o

Can't you expand pixelutils instead? Adding a new set of functions that
work on lines rather than blocks.
Marton Balint Nov. 4, 2018, 5:45 p.m. UTC | #2
On Sun, 4 Nov 2018, James Almer wrote:

> On 11/4/2018 9:06 AM, Marton Balint wrote:
>> Also add SIMD which works on lines because it is faster then calculating it on
>> 8x8 blocks using pixelutils.
>> 
>> Signed-off-by: Marton Balint <cus@passwd.hu>
>> ---
>>  configure                        |  3 +-
>>  libavfilter/Makefile             |  1 +
>>  libavfilter/framerate.h          |  4 +--
>>  libavfilter/scene_sad.c          | 72 ++++++++++++++++++++++++++++++++++++++
>>  libavfilter/scene_sad.h          | 44 ++++++++++++++++++++++++
>>  libavfilter/vf_framerate.c       | 61 ++++-----------------------------
>>  libavfilter/x86/Makefile         |  4 +++
>>  libavfilter/x86/scene_sad.asm    | 74 ++++++++++++++++++++++++++++++++++++++++
>>  libavfilter/x86/scene_sad_init.c | 52 ++++++++++++++++++++++++++++
>>  9 files changed, 257 insertions(+), 58 deletions(-)
>>  create mode 100644 libavfilter/scene_sad.c
>>  create mode 100644 libavfilter/scene_sad.h
>>  create mode 100644 libavfilter/x86/scene_sad.asm
>>  create mode 100644 libavfilter/x86/scene_sad_init.c
>> 
>> diff --git a/configure b/configure
>> index 2606b885b0..f5bec9fd62 100755
>> --- a/configure
>> +++ b/configure
>> @@ -2335,6 +2335,7 @@ CONFIG_EXTRA="
>>      rtpdec
>>      rtpenc_chain
>>      rv34dsp
>> +    scene_sad
>>      sinewin
>>      snappy
>>      srtp
>> @@ -3395,7 +3396,7 @@ find_rect_filter_deps="avcodec avformat gpl"
>>  firequalizer_filter_deps="avcodec"
>>  firequalizer_filter_select="rdft"
>>  flite_filter_deps="libflite"
>> -framerate_filter_select="pixelutils"
>> +framerate_filter_select="scene_sad"
>>  frei0r_filter_deps="frei0r libdl"
>>  frei0r_src_filter_deps="frei0r libdl"
>>  fspp_filter_deps="gpl"
>> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
>> index 51e48efc2e..390c2b7997 100644
>> --- a/libavfilter/Makefile
>> +++ b/libavfilter/Makefile
>> @@ -28,6 +28,7 @@ OBJS-$(HAVE_THREADS)                         += pthread.o
>>  OBJS-$(CONFIG_QSVVPP)                        += qsvvpp.o
>>  DNN-OBJS-$(CONFIG_LIBTENSORFLOW)             += dnn_backend_tf.o
>>  OBJS-$(CONFIG_DNN)                           += dnn_interface.o dnn_backend_native.o $(DNN-OBJS-yes)
>> +OBJS-$(CONFIG_SCENE_SAD)                     += scene_sad.o
>>
>>  # audio filters
>>  OBJS-$(CONFIG_ABENCH_FILTER)                 += f_bench.o
>
> Can't you expand pixeutils instead? Adding a new set of functions that
> work on lines rather than blocks.

That is kind of intentional, because it is only used by libavfilter, so I 
did not want to bloat libavutil with it. Also if I put it into libavutil 
then the interfaces have to be public. I tried to avoid that because for a 
generic scene SAD function bitdepth is not enough, you need endianness, 
float/int color support, etc. Public API should be finalized when 
something actually uses it IMHO.

Regards,
Marton
Marton Balint Nov. 10, 2018, 6:38 p.m. UTC | #3
On Sun, 4 Nov 2018, Marton Balint wrote:

>
> On Sun, 4 Nov 2018, James Almer wrote:
>
>> On 11/4/2018 9:06 AM, Marton Balint wrote:
>>> Also add SIMD which works on lines because it is faster then calculating 
> it on
>>> 8x8 blocks using pixelutils.
>>> 
>>> Signed-off-by: Marton Balint <cus@passwd.hu>
>>> ---
>>>  configure                        |  3 +-
>>>  libavfilter/Makefile             |  1 +
>>>  libavfilter/framerate.h          |  4 +--
>>>  libavfilter/scene_sad.c          | 72 
> ++++++++++++++++++++++++++++++++++++++
>>>  libavfilter/scene_sad.h          | 44 ++++++++++++++++++++++++
>>>  libavfilter/vf_framerate.c       | 61 ++++-----------------------------
>>>  libavfilter/x86/Makefile         |  4 +++
>>>  libavfilter/x86/scene_sad.asm    | 74 
> ++++++++++++++++++++++++++++++++++++++++
>>>  libavfilter/x86/scene_sad_init.c | 52 ++++++++++++++++++++++++++++
>>>  9 files changed, 257 insertions(+), 58 deletions(-)
>>>  create mode 100644 libavfilter/scene_sad.c
>>>  create mode 100644 libavfilter/scene_sad.h
>>>  create mode 100644 libavfilter/x86/scene_sad.asm
>>>  create mode 100644 libavfilter/x86/scene_sad_init.c
>>> 
>>> diff --git a/configure b/configure
>>> index 2606b885b0..f5bec9fd62 100755
>>> --- a/configure
>>> +++ b/configure
>>> @@ -2335,6 +2335,7 @@ CONFIG_EXTRA="
>>>      rtpdec
>>>      rtpenc_chain
>>>      rv34dsp
>>> +    scene_sad
>>>      sinewin
>>>      snappy
>>>      srtp
>>> @@ -3395,7 +3396,7 @@ find_rect_filter_deps="avcodec avformat gpl"
>>>  firequalizer_filter_deps="avcodec"
>>>  firequalizer_filter_select="rdft"
>>>  flite_filter_deps="libflite"
>>> -framerate_filter_select="pixelutils"
>>> +framerate_filter_select="scene_sad"
>>>  frei0r_filter_deps="frei0r libdl"
>>>  frei0r_src_filter_deps="frei0r libdl"
>>>  fspp_filter_deps="gpl"
>>> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
>>> index 51e48efc2e..390c2b7997 100644
>>> --- a/libavfilter/Makefile
>>> +++ b/libavfilter/Makefile
>>> @@ -28,6 +28,7 @@ OBJS-$(HAVE_THREADS)                         += 
> pthread.o
>>>  OBJS-$(CONFIG_QSVVPP)                        += qsvvpp.o
>>>  DNN-OBJS-$(CONFIG_LIBTENSORFLOW)             += dnn_backend_tf.o
>>>  OBJS-$(CONFIG_DNN)                           += dnn_interface.o 
> dnn_backend_native.o $(DNN-OBJS-yes)
>>> +OBJS-$(CONFIG_SCENE_SAD)                     += scene_sad.o
>>>
>>>  # audio filters
>>>  OBJS-$(CONFIG_ABENCH_FILTER)                 += f_bench.o
>>
>> Can't you expand pixeutils instead? Adding a new set of functions that
>> work on lines rather than blocks.
>
> That is kind of intentional, because it is only used by libavfilter, so I 
> did not want to bloat libavutil with it. Also if I put it into libavutil 
> then the interfaces have to be public. I tried to avoid that because for a 
> generic scene SAD function bitdepth is not enough, you need endianness, 
> float/int color support, etc. Public API should be finalized when 
> something actually uses it IMHO.

I plan to push this soon.

Regards,
Marton
Marton Balint Nov. 11, 2018, 7:36 p.m. UTC | #4
On Sat, 10 Nov 2018, Marton Balint wrote:

>
> On Sun, 4 Nov 2018, Marton Balint wrote:
>
>>
>> On Sun, 4 Nov 2018, James Almer wrote:
>>
>>> On 11/4/2018 9:06 AM, Marton Balint wrote:
>>>> Also add SIMD which works on lines because it is faster then calculating 
>> it on
>>>> 8x8 blocks using pixelutils.
>>>> 
>>>> Signed-off-by: Marton Balint <cus@passwd.hu>
>>>> ---
>>>>  configure                        |  3 +-
>>>>  libavfilter/Makefile             |  1 +
>>>>  libavfilter/framerate.h          |  4 +--
>>>>  libavfilter/scene_sad.c          | 72 
>> ++++++++++++++++++++++++++++++++++++++
>>>>  libavfilter/scene_sad.h          | 44 ++++++++++++++++++++++++
>>>>  libavfilter/vf_framerate.c       | 61 ++++-----------------------------
>>>>  libavfilter/x86/Makefile         |  4 +++
>>>>  libavfilter/x86/scene_sad.asm    | 74 
>> ++++++++++++++++++++++++++++++++++++++++
>>>>  libavfilter/x86/scene_sad_init.c | 52 ++++++++++++++++++++++++++++
>>>>  9 files changed, 257 insertions(+), 58 deletions(-)
>>>>  create mode 100644 libavfilter/scene_sad.c
>>>>  create mode 100644 libavfilter/scene_sad.h
>>>>  create mode 100644 libavfilter/x86/scene_sad.asm
>>>>  create mode 100644 libavfilter/x86/scene_sad_init.c
>>>> 
>>>> diff --git a/configure b/configure
>>>> index 2606b885b0..f5bec9fd62 100755
>>>> --- a/configure
>>>> +++ b/configure
>>>> @@ -2335,6 +2335,7 @@ CONFIG_EXTRA="
>>>>      rtpdec
>>>>      rtpenc_chain
>>>>      rv34dsp
>>>> +    scene_sad
>>>>      sinewin
>>>>      snappy
>>>>      srtp
>>>> @@ -3395,7 +3396,7 @@ find_rect_filter_deps="avcodec avformat gpl"
>>>>  firequalizer_filter_deps="avcodec"
>>>>  firequalizer_filter_select="rdft"
>>>>  flite_filter_deps="libflite"
>>>> -framerate_filter_select="pixelutils"
>>>> +framerate_filter_select="scene_sad"
>>>>  frei0r_filter_deps="frei0r libdl"
>>>>  frei0r_src_filter_deps="frei0r libdl"
>>>>  fspp_filter_deps="gpl"
>>>> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
>>>> index 51e48efc2e..390c2b7997 100644
>>>> --- a/libavfilter/Makefile
>>>> +++ b/libavfilter/Makefile
>>>> @@ -28,6 +28,7 @@ OBJS-$(HAVE_THREADS)                         += 
>> pthread.o
>>>>  OBJS-$(CONFIG_QSVVPP)                        += qsvvpp.o
>>>>  DNN-OBJS-$(CONFIG_LIBTENSORFLOW)             += dnn_backend_tf.o
>>>>  OBJS-$(CONFIG_DNN)                           += dnn_interface.o 
>> dnn_backend_native.o $(DNN-OBJS-yes)
>>>> +OBJS-$(CONFIG_SCENE_SAD)                     += scene_sad.o
>>>>
>>>>  # audio filters
>>>>  OBJS-$(CONFIG_ABENCH_FILTER)                 += f_bench.o
>>>
>>> Can't you expand pixeutils instead? Adding a new set of functions that
>>> work on lines rather than blocks.
>>
>> That is kind of intentional, because it is only used by libavfilter, so I 
>> did not want to bloat libavutil with it. Also if I put it into libavutil 
>> then the interfaces have to be public. I tried to avoid that because for a 
>> generic scene SAD function bitdepth is not enough, you need endianness, 
>> float/int color support, etc. Public API should be finalized when 
>> something actually uses it IMHO.
>
> I plan to push this soon.

Pushed.

Regards,
Marton
diff mbox

Patch

diff --git a/configure b/configure
index 2606b885b0..f5bec9fd62 100755
--- a/configure
+++ b/configure
@@ -2335,6 +2335,7 @@  CONFIG_EXTRA="
     rtpdec
     rtpenc_chain
     rv34dsp
+    scene_sad
     sinewin
     snappy
     srtp
@@ -3395,7 +3396,7 @@  find_rect_filter_deps="avcodec avformat gpl"
 firequalizer_filter_deps="avcodec"
 firequalizer_filter_select="rdft"
 flite_filter_deps="libflite"
-framerate_filter_select="pixelutils"
+framerate_filter_select="scene_sad"
 frei0r_filter_deps="frei0r libdl"
 frei0r_src_filter_deps="frei0r libdl"
 fspp_filter_deps="gpl"
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 51e48efc2e..390c2b7997 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -28,6 +28,7 @@  OBJS-$(HAVE_THREADS)                         += pthread.o
 OBJS-$(CONFIG_QSVVPP)                        += qsvvpp.o
 DNN-OBJS-$(CONFIG_LIBTENSORFLOW)             += dnn_backend_tf.o
 OBJS-$(CONFIG_DNN)                           += dnn_interface.o dnn_backend_native.o $(DNN-OBJS-yes)
+OBJS-$(CONFIG_SCENE_SAD)                     += scene_sad.o
 
 # audio filters
 OBJS-$(CONFIG_ABENCH_FILTER)                 += f_bench.o
diff --git a/libavfilter/framerate.h b/libavfilter/framerate.h
index a42d5af68a..8048dfa36a 100644
--- a/libavfilter/framerate.h
+++ b/libavfilter/framerate.h
@@ -19,7 +19,7 @@ 
 #ifndef AVFILTER_FRAMERATE_H
 #define AVFILTER_FRAMERATE_H
 
-#include "libavutil/pixelutils.h"
+#include "scene_sad.h"
 #include "avfilter.h"
 
 #define BLEND_FUNC_PARAMS const uint8_t *src1, ptrdiff_t src1_linesize, \
@@ -48,7 +48,7 @@  typedef struct FrameRateContext {
     AVRational srce_time_base;          ///< timebase of source
     AVRational dest_time_base;          ///< timebase of destination
 
-    av_pixelutils_sad_fn sad;           ///< Sum of the absolute difference function (scene detect only)
+    ff_scene_sad_fn sad;                ///< Sum of the absolute difference function (scene detect only)
     double prev_mafd;                   ///< previous MAFD                           (scene detect only)
 
     int blend_factor_max;
diff --git a/libavfilter/scene_sad.c b/libavfilter/scene_sad.c
new file mode 100644
index 0000000000..fa57a25961
--- /dev/null
+++ b/libavfilter/scene_sad.c
@@ -0,0 +1,72 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Scene SAD functions
+ */
+
+#include "scene_sad.h"
+
+void ff_scene_sad16_c(SCENE_SAD_PARAMS) /* plain C SAD of two planes read as 16-bit samples */
+{
+    uint64_t sad = 0;                                /* running total for the whole plane */
+    const uint16_t *src1w = (const uint16_t *)src1;  /* rows are read as uint16_t samples */
+    const uint16_t *src2w = (const uint16_t *)src2;
+    int x, y;
+
+    stride1 /= 2; /* halved because src1w/src2w advance in uint16_t units */
+    stride2 /= 2;
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)                  /* width counts samples here, not bytes */
+            sad += FFABS(src1w[x] - src2w[x]);
+        src1w += stride1;                            /* step both planes to the next row */
+        src2w += stride2;
+    }
+    *sum = sad;                                      /* the result is returned through *sum */
+}
+
+void ff_scene_sad_c(SCENE_SAD_PARAMS) /* plain C SAD of two planes of 8-bit samples */
+{
+    uint64_t sad = 0;                   /* running total for the whole plane */
+    int x, y;
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)     /* a zero width or height leaves sad at 0 */
+            sad += FFABS(src1[x] - src2[x]);
+        src1 += stride1;                /* strides are applied directly to the uint8_t pointers */
+        src2 += stride2;
+    }
+    *sum = sad;                         /* the result is returned through *sum */
+}
+
+ff_scene_sad_fn ff_scene_sad_get_fn(int depth) /* picks the SAD routine for a sample depth */
+{
+    ff_scene_sad_fn sad = NULL;
+    if (ARCH_X86)                               /* presumably a 0/1 build-time constant -- confirm */
+        sad = ff_scene_sad_get_fn_x86(depth);   /* may yield NULL; handled just below */
+    if (!sad) {                                 /* no SIMD routine: fall back to the C code */
+        if (depth == 8)
+            sad = ff_scene_sad_c;
+        if (depth == 16)
+            sad = ff_scene_sad16_c;
+    }
+    return sad;                                 /* NULL if no routine matched this depth */
+}
+
diff --git a/libavfilter/scene_sad.h b/libavfilter/scene_sad.h
new file mode 100644
index 0000000000..433c69473d
--- /dev/null
+++ b/libavfilter/scene_sad.h
@@ -0,0 +1,44 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Scene SAD functions
+ */
+
+#ifndef AVFILTER_SCENE_SAD_H
+#define AVFILTER_SCENE_SAD_H
+
+#include "avfilter.h"
+
+#define SCENE_SAD_PARAMS const uint8_t *src1, ptrdiff_t stride1, \
+                         const uint8_t *src2, ptrdiff_t stride2, \
+                         ptrdiff_t width, ptrdiff_t height, \
+                         uint64_t *sum /* out: the computed SAD is stored here */
+
+typedef void (*ff_scene_sad_fn)(SCENE_SAD_PARAMS); /* common signature of all SAD routines */
+
+void ff_scene_sad_c(SCENE_SAD_PARAMS);   /* C version for 8-bit samples */
+
+void ff_scene_sad16_c(SCENE_SAD_PARAMS); /* C version for 16-bit samples */
+
+ff_scene_sad_fn ff_scene_sad_get_fn_x86(int depth); /* x86 SIMD lookup; NULL if none applies */
+
+ff_scene_sad_fn ff_scene_sad_get_fn(int depth);     /* generic lookup: x86 SIMD first, then C */
+
+#endif /* AVFILTER_SCENE_SAD_H */
diff --git a/libavfilter/vf_framerate.c b/libavfilter/vf_framerate.c
index fb65381923..06e463e4d7 100644
--- a/libavfilter/vf_framerate.c
+++ b/libavfilter/vf_framerate.c
@@ -33,13 +33,13 @@ 
 #include "libavutil/internal.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
-#include "libavutil/pixelutils.h"
 
 #include "avfilter.h"
 #include "internal.h"
 #include "video.h"
 #include "filters.h"
 #include "framerate.h"
+#include "scene_sad.h"
 
 #define OFFSET(x) offsetof(FrameRateContext, x)
 #define V AV_OPT_FLAG_VIDEO_PARAM
@@ -62,52 +62,6 @@  static const AVOption framerate_options[] = {
 
 AVFILTER_DEFINE_CLASS(framerate);
 
-static av_always_inline int64_t sad_8x8_16(const uint16_t *src1, ptrdiff_t stride1,
-                                           const uint16_t *src2, ptrdiff_t stride2)
-{
-    int sum = 0;
-    int x, y;
-
-    for (y = 0; y < 8; y++) {
-        for (x = 0; x < 8; x++)
-            sum += FFABS(src1[x] - src2[x]);
-        src1 += stride1;
-        src2 += stride2;
-    }
-    return sum;
-}
-
-static int64_t scene_sad16(FrameRateContext *s, const uint16_t *p1, int p1_linesize, const uint16_t* p2, int p2_linesize, const int width, const int height)
-{
-    int64_t sad;
-    int x, y;
-    for (sad = y = 0; y < height - 7; y += 8) {
-        for (x = 0; x < width - 7; x += 8) {
-            sad += sad_8x8_16(p1 + y * p1_linesize + x,
-                              p1_linesize,
-                              p2 + y * p2_linesize + x,
-                              p2_linesize);
-        }
-    }
-    return sad;
-}
-
-static int64_t scene_sad8(FrameRateContext *s, uint8_t *p1, int p1_linesize, uint8_t* p2, int p2_linesize, const int width, const int height)
-{
-    int64_t sad;
-    int x, y;
-    for (sad = y = 0; y < height - 7; y += 8) {
-        for (x = 0; x < width - 7; x += 8) {
-            sad += s->sad(p1 + y * p1_linesize + x,
-                          p1_linesize,
-                          p2 + y * p2_linesize + x,
-                          p2_linesize);
-        }
-    }
-    emms_c();
-    return sad;
-}
-
 static double get_scene_score(AVFilterContext *ctx, AVFrame *crnt, AVFrame *next)
 {
     FrameRateContext *s = ctx->priv;
@@ -117,16 +71,13 @@  static double get_scene_score(AVFilterContext *ctx, AVFrame *crnt, AVFrame *next
 
     if (crnt->height == next->height &&
         crnt->width  == next->width) {
-        int64_t sad;
+        uint64_t sad;
         double mafd, diff;
 
         ff_dlog(ctx, "get_scene_score() process\n");
-        if (s->bitdepth == 8)
-            sad = scene_sad8(s, crnt->data[0], crnt->linesize[0], next->data[0], next->linesize[0], crnt->width, crnt->height);
-        else
-            sad = scene_sad16(s, (const uint16_t*)crnt->data[0], crnt->linesize[0] / 2, (const uint16_t*)next->data[0], next->linesize[0] / 2, crnt->width, crnt->height);
-
-        mafd = (double)sad * 100.0 / FFMAX(1, (crnt->height & ~7) * (crnt->width & ~7)) / (1 << s->bitdepth);
+        s->sad(crnt->data[0], crnt->linesize[0], next->data[0], next->linesize[0], crnt->width, crnt->height, &sad);
+        emms_c();
+        mafd = (double)sad * 100.0 / (crnt->width * crnt->height) / (1 << s->bitdepth);
         diff = fabs(mafd - s->prev_mafd);
         ret  = av_clipf(FFMIN(mafd, diff), 0, 100.0);
         s->prev_mafd = mafd;
@@ -350,7 +301,7 @@  static int config_input(AVFilterLink *inlink)
     s->bitdepth = pix_desc->comp[0].depth;
     s->vsub = pix_desc->log2_chroma_h;
 
-    s->sad = av_pixelutils_get_sad_fn(3, 3, 2, s); // 8x8 both sources aligned
+    s->sad = ff_scene_sad_get_fn(s->bitdepth == 8 ? 8 : 16);
     if (!s->sad)
         return AVERROR(EINVAL);
 
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index b484c8bd1c..6eecb94359 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -1,3 +1,5 @@ 
+OBJS-$(CONFIG_SCENE_SAD)                     += x86/scene_sad_init.o
+
 OBJS-$(CONFIG_AFIR_FILTER)                   += x86/af_afir_init.o
 OBJS-$(CONFIG_BLEND_FILTER)                  += x86/vf_blend_init.o
 OBJS-$(CONFIG_BWDIF_FILTER)                  += x86/vf_bwdif_init.o
@@ -29,6 +31,8 @@  OBJS-$(CONFIG_VOLUME_FILTER)                 += x86/af_volume_init.o
 OBJS-$(CONFIG_W3FDIF_FILTER)                 += x86/vf_w3fdif_init.o
 OBJS-$(CONFIG_YADIF_FILTER)                  += x86/vf_yadif_init.o
 
+X86ASM-OBJS-$(CONFIG_SCENE_SAD)              += x86/scene_sad.o
+
 X86ASM-OBJS-$(CONFIG_AFIR_FILTER)            += x86/af_afir.o
 X86ASM-OBJS-$(CONFIG_BLEND_FILTER)           += x86/vf_blend.o
 X86ASM-OBJS-$(CONFIG_BWDIF_FILTER)           += x86/vf_bwdif.o
diff --git a/libavfilter/x86/scene_sad.asm b/libavfilter/x86/scene_sad.asm
new file mode 100644
index 0000000000..d38d71ccca
--- /dev/null
+++ b/libavfilter/x86/scene_sad.asm
@@ -0,0 +1,74 @@ 
+;*****************************************************************************
+;* x86-optimized functions for scene SAD
+;*
+;* Copyright (C) 2018 Marton Balint
+;*
+;* Based on vf_blend.asm, Copyright (C) 2015 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+
+%macro SAD_INIT 0
+cglobal scene_sad, 6, 7, 2, src1, stride1, src2, stride2, width, end, x ; "end" receives the height argument
+    add     src1q, widthq   ; point src1 at the end of its first row
+    add     src2q, widthq   ; same for src2
+    neg    widthq           ; x will run from -width up towards 0
+    pxor       m1, m1       ; m1 = 0, the SAD accumulator
+%endmacro
+
+
+%macro SAD_LOOP 0
+.nextrow:
+    mov        xq, widthq                   ; restart at -width for every row
+
+    .loop:
+        movu            m0, [src1q + xq]    ; unaligned load of mmsize bytes from src1
+        psadbw          m0, [src2q + xq]    ; SAD per 8-byte group; NOTE(review): non-VEX form needs an aligned operand - confirm
+        paddq           m1, m0              ; accumulate in the 64-bit lanes of m1
+        add             xq, mmsize
+    jl .loop                                ; loop while x < 0; the body runs at least once per row
+    add     src1q, stride1q                 ; advance both sources by one row
+    add     src2q, stride2q
+    sub      endd, 1
+    jg .nextrow                             ; until the row counter reaches 0
+
+    mov         r0q, r6mp                   ; fetch the 7th argument, the sum pointer
+    movu      [r0q], m1      ; sum: all mmsize bytes of m1 are stored, lanes are not folded here
+REP_RET
+%endmacro
+
+
+%macro SAD_FRAMES 0
+    SAD_INIT
+    SAD_LOOP
+%endmacro
+
+
+INIT_XMM sse2
+SAD_FRAMES
+
+%if HAVE_AVX2_EXTERNAL
+
+INIT_YMM avx2
+SAD_FRAMES
+
+%endif
diff --git a/libavfilter/x86/scene_sad_init.c b/libavfilter/x86/scene_sad_init.c
new file mode 100644
index 0000000000..461fa406d9
--- /dev/null
+++ b/libavfilter/x86/scene_sad_init.c
@@ -0,0 +1,52 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/scene_sad.h"
+
+#define SCENE_SAD_FUNC(FUNC_NAME, ASM_FUNC_NAME, MMSIZE)                      \
+void ASM_FUNC_NAME(SCENE_SAD_PARAMS);      /* the assembly routine */         \
+/* Wrapper: SIMD on the MMSIZE-aligned width, C code on the remainder. */     \
+static void FUNC_NAME(SCENE_SAD_PARAMS) {                                     \
+    uint64_t sad[MMSIZE / 8] = {0};  /* asm stores one full vector here */    \
+    ptrdiff_t awidth = width & ~(MMSIZE - 1); /* width rounded down */        \
+    *sum = 0; /* NOTE(review): awidth is 0 if width < MMSIZE; check the asm */\
+    ASM_FUNC_NAME(src1, stride1, src2, stride2, awidth, height, sad);         \
+    for (int i = 0; i < MMSIZE / 8; i++)  /* fold the 64-bit lanes */         \
+        *sum += sad[i];                                                       \
+    ff_scene_sad_c(src1 + awidth, stride1,  /* columns past awidth in C */    \
+                   src2 + awidth, stride2,                                    \
+                   width - awidth, height, sad); /* result lands in sad[0] */ \
+    *sum += sad[0];                                                           \
+}
+
+SCENE_SAD_FUNC(scene_sad_sse2, ff_scene_sad_sse2, 16); /* 16-byte vectors */
+SCENE_SAD_FUNC(scene_sad_avx2, ff_scene_sad_avx2, 32); /* 32-byte vectors */
+
+ff_scene_sad_fn ff_scene_sad_get_fn_x86(int depth) /* x86 SIMD lookup for the given sample depth */
+{
+    int cpu_flags = av_get_cpu_flags();
+    if (depth == 8) {                        /* only the 8-bit case has SIMD versions here */
+        if (EXTERNAL_AVX2_FAST(cpu_flags))   /* AVX2 is tried first */
+            return scene_sad_avx2;
+        else if (EXTERNAL_SSE2(cpu_flags))
+            return scene_sad_sse2;
+    }
+    return NULL;                             /* other depths, or neither CPU check passed */
+}