[FFmpeg-devel] avfilter: add vmafmotion filter

Submitted by Ashish Singh on Sept. 6, 2017, 8:04 p.m.

Details

Message ID 1504728287-16766-1-git-send-email-ashk43712@gmail.com
State New
Headers show

Commit Message

Ashish Singh Sept. 6, 2017, 8:04 p.m.
From: Ashish Singh <ashk43712@gmail.com>

Hi, this patch changes previous one to framesync2.
SIMD is a work in progress for this filter.

Signed-off-by: Ashish Singh <ashk43712@gmail.com>
---
 Changelog                   |   1 +
 doc/filters.texi            |  16 ++
 libavfilter/Makefile        |   1 +
 libavfilter/allfilters.c    |   1 +
 libavfilter/vf_vmafmotion.c | 413 ++++++++++++++++++++++++++++++++++++++++++++
 libavfilter/vmaf_motion.h   |  42 +++++
 6 files changed, 474 insertions(+)
 create mode 100644 libavfilter/vf_vmafmotion.c
 create mode 100644 libavfilter/vmaf_motion.h

Comments

Ronald S. Bultje Sept. 8, 2017, 3:38 p.m.
Hi,

On Wed, Sep 6, 2017 at 4:04 PM, Ashish Pratap Singh <ashk43712@gmail.com>
wrote:

> +#define MAX_ALIGN 32
> +#define ALIGN_CEIL(x) ((x) + ((x) % MAX_ALIGN ? MAX_ALIGN - (x) %
> MAX_ALIGN : 0))
>

../libavutil/macros.h:#define FFALIGN(x, a) (((x)+(a)-1)&~((a)-1))

>
> +static const AVOption vmafmotion_options[] = {
> +    { NULL }
> +};
>

#define vmafmotion_options NULL?


> +FRAMESYNC_DEFINE_CLASS(vmafmotion, VMAFMotionContext, fs);
>

src/libavfilter/vf_vmafmotion.c:59:1: warning: unused function
'vmafmotion_framesync_preinit' [-Wunused-function]
FRAMESYNC_DEFINE_CLASS(vmafmotion, VMAFMotionContext, fs);
^
src/libavfilter/framesync2.h:298:12: note: expanded from macro
'FRAMESYNC_DEFINE_CLASS'
static int name##_framesync_preinit(AVFilterContext *ctx) { \
           ^
<scratch space>:44:1: note: expanded from here
vmafmotion_framesync_preinit


> +static inline int floorn(int n, int m)
> +{
> +    return n - n % m;
> +}
> +
> +static inline int ceiln(int n, int m)
> +{
> +    return n % m ? n + (m - n % m) : n;
> +}
> +
> +static void convolution_x(const int *filter, int filt_w, const uint16_t
> *src,
> +                          uint16_t *dst, int w, int h, ptrdiff_t
> src_stride,
> +                          ptrdiff_t dst_stride)
>

I would move sad() convolution_x/y() as function pointers into a
VMAFMotionDSPContext.

In ffmpeg, we always express "stride" in bytes, not in pixels. In SIMD, it
will become clear why. Same comment for _y.


> +{
> +    int radius = filt_w / 2;
> +    int borders_left = ceiln(radius, 1);
> +    int borders_right = floorn(w - (filt_w - radius), 1);
>

For cases where m is 1, ceiln() and floorn() just return n, so this can be
simplified. Same comment for _y.


> +#define conv_y_fn(type, bits) \
> +    static void convolution_y_##bits##bit(const int *filter, int filt_w,
> \
> +                                          const type *src, uint16_t *dst,
> \
> +                                          int w, int h, ptrdiff_t
> src_stride, \
> +                                          ptrdiff_t dst_stride) \
> +{ \
> +    int radius = filt_w / 2; \
> +    int borders_top = ceiln(radius, 1); \
> +    int borders_bottom = floorn(h - (filt_w - radius), 1); \
> +    int i, j, k; \
> +    int sum = 0; \
> +    \
> +    for (i = 0; i < borders_top; i++) { \
> +        for (j = 0; j < w; j++) { \
> +            sum = 0; \
> +            for (k = 0; k < filt_w; k++) { \
> +                int i_tap = FFABS(i - radius + k); \
> +                if (i_tap >= h) { \
> +                    i_tap = h - (i_tap - h + 1); \
> +                } \
> +                sum += filter[k] * src[i_tap * src_stride + j]; \
> +            } \
> +            dst[i * dst_stride + j] = sum >> N; \
>

Same comment for bottom 2: I would decrease the shift in this expression by
a few bits. The reason is that you can use the extra bits as fractional
bits to increase precision. E.g. for 8-bits, the input is 8bits and the
filter is 15, but the uint16_t can hold 16 bits (let's say 15 so we can use
signed instructions also) data, so instead of 8*15>>15=8 bits, we can shift
by just 8bits to get 15 bits (full-width) uint16_t. Same for 10-bit, where
we can shift by 10 instead of 15.

The sad calculation then indeed has to downshift the result again (*score =
(double) (sad * 1.0 / (w * h)); becomes sad * 1.0 / (w * h << (15-bits)).


> +void convolution_f32(const int *filter, int filt_w, const void *src,
> +                     uint16_t *dst, uint16_t *tmp, int w, int h,
> +                     ptrdiff_t src_stride, ptrdiff_t dst_stride, uint8_t
> type)
> +{
> +    if(type == 8) {
>

Rename "type" to bitdepth.


> +    if (s->desc->comp[0].depth <= 8) {
> +        ref_px_stride = ref_stride / sizeof(uint8_t);
> +        convolution_f32(s->filter, 5, (const uint8_t *) ref->data[0],
> +                        s->blur_data, s->temp_data, s->width, s->height,
> +                        ref_px_stride, px_stride, 8);
> +    } else {
> +        ref_px_stride = ref_stride / sizeof(uint16_t);
> +        convolution_f32(s->filter, 5, (const uint16_t *) ref->data[0],
> +                        s->blur_data, s->temp_data, s->width, s->height,
> +                        ref_px_stride, px_stride, 10);
> +    }
>

The if and resulting cast are unnecessary if you express the stride in bytes
instead of pixels.

+    memcpy(s->prev_blur_data, s->blur_data, data_sz);
>

Hm... Try FFSWAP with pointers.


> +static int query_formats(AVFilterContext *ctx)
> +{
> +    static const enum AVPixelFormat pix_fmts[] = {
> +        AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV420P,
> +        AV_PIX_FMT_YUV444P10LE, AV_PIX_FMT_YUV422P10LE,
> AV_PIX_FMT_YUV420P10LE,
>

Remove the LE suffix, this filter handles native endian.


> +static int activate(AVFilterContext *ctx)
> +{
> +    VMAFMotionContext *s = ctx->priv;
> +    return ff_framesync2_activate(&s->fs);
> +}
> +
> +
> +static av_cold void uninit(AVFilterContext *ctx)
>

One newline too many in between the two functions.


> +#ifndef MOTION_TOOLS_H_
> +#define MOTION_TOOLS_H_
>

AVFILTER_VMAFMOTION_H


> +#define N 15
> +
> +static const float FILTER_5[5] = {
> +    0.054488685,
> +    0.244201342,
> +    0.402619947,
> +    0.244201342,
> +    0.054488685
> +};
>

>
+void convolution_f32(const int *filter, int filt_width, const void *src,
> +                     uint16_t *dst, uint16_t *tmp, int w, int h,
> +                     ptrdiff_t src_stride, ptrdiff_t dst_stride, uint8_t
> type);
>

These 3 should be in the C file. I would say N is a very generic name but
if it compiles I guess it's fine :-).

Ronald
Ronald S. Bultje Sept. 13, 2017, 1:10 p.m.
Hi,

On Wed, Sep 6, 2017 at 4:04 PM, Ashish Pratap Singh <ashk43712@gmail.com>
wrote:

> From: Ashish Singh <ashk43712@gmail.com>
>
> Hi, this patch changes previous one to framesync2.
> SIMD is a work in progress for this filter.
>
> Signed-off-by: Ashish Singh <ashk43712@gmail.com>
> ---
>  Changelog                   |   1 +
>  doc/filters.texi            |  16 ++
>  libavfilter/Makefile        |   1 +
>  libavfilter/allfilters.c    |   1 +
>  libavfilter/vf_vmafmotion.c | 413 ++++++++++++++++++++++++++++++
> ++++++++++++++
>  libavfilter/vmaf_motion.h   |  42 +++++
>  6 files changed, 474 insertions(+)
>  create mode 100644 libavfilter/vf_vmafmotion.c
>  create mode 100644 libavfilter/vmaf_motion.h


One more general comment on this filter: it appears to me that the motion
is calculated based on the reference, and we only use the "main" to apply
the metadata to. Although this makes sense from the "vmaf" filter
perspective, I'm actually wondering if - from the perspective of the
"vmafmotion" filter alone, it wouldn't be simpler to just have a single
filterpad input ("reference") and apply the metadata on it (when used by
itself). (The "vmaf" filter could still apply metadata on the "main").

What do people think? Would you prefer the "vmaf" and "vmafmotion" to
consistently apply the metadata on the "main" video frames, or would you
prefer that the "vmafmotion" filter more accurately presents which frame is
used for the motion scoring, which also happens to lead to simpler code /
filterchains?

Ronald
Ashish Singh Sept. 13, 2017, 1:34 p.m.
Hi,

On Wed, Sep 13, 2017 at 6:40 PM, Ronald S. Bultje <rsbultje@gmail.com>
wrote:

> Hi,
>
> On Wed, Sep 6, 2017 at 4:04 PM, Ashish Pratap Singh <ashk43712@gmail.com>
> wrote:
>
>> From: Ashish Singh <ashk43712@gmail.com>
>>
>> Hi, this patch changes previous one to framesync2.
>> SIMD is a work in progress for this filter.
>>
>> Signed-off-by: Ashish Singh <ashk43712@gmail.com>
>> ---
>>  Changelog                   |   1 +
>>  doc/filters.texi            |  16 ++
>>  libavfilter/Makefile        |   1 +
>>  libavfilter/allfilters.c    |   1 +
>>  libavfilter/vf_vmafmotion.c | 413 ++++++++++++++++++++++++++++++
>> ++++++++++++++
>>  libavfilter/vmaf_motion.h   |  42 +++++
>>  6 files changed, 474 insertions(+)
>>  create mode 100644 libavfilter/vf_vmafmotion.c
>>  create mode 100644 libavfilter/vmaf_motion.h
>
>
> One more general comment on this filter: it appears to me that the motion
> is calculated based on the reference, and we only use the "main" to apply
> the metadata to. Although this makes sense from the "vmaf" filter
> perspective, I'm actually wondering if - from the perspective of the
> "vmafmotion" filter alone, it wouldn't be simpler to just have a single
> filterpad input ("reference") and apply the metadata on it (when used by
> itself). (The "vmaf" filter could still apply metadata on the "main").
>
> What do people think? Would you prefer the "vmaf" and "vmafmotion" to
> consistently apply the metadata on the "main" video frames, or would you
> prefer that the "vmafmotion" filter more accurately presents which frame is
> used for the motion scoring, which also happens to lead to simpler code /
> filterchains?
>
> Ronald
>

yeah, you are right. In this filter only reference video is considered. So
I can make it a single filterpad input if everything is fine.
Tobias Rapp Sept. 13, 2017, 2:09 p.m.
On 13.09.2017 15:10, Ronald S. Bultje wrote:
> Hi,
> 
> On Wed, Sep 6, 2017 at 4:04 PM, Ashish Pratap Singh <ashk43712@gmail.com>
> wrote:
> 
>> From: Ashish Singh <ashk43712@gmail.com>
>>
>> Hi, this patch changes previous one to framesync2.
>> SIMD is a work in progress for this filter.
>>
>> Signed-off-by: Ashish Singh <ashk43712@gmail.com>
>> ---
>>   Changelog                   |   1 +
>>   doc/filters.texi            |  16 ++
>>   libavfilter/Makefile        |   1 +
>>   libavfilter/allfilters.c    |   1 +
>>   libavfilter/vf_vmafmotion.c | 413 ++++++++++++++++++++++++++++++
>> ++++++++++++++
>>   libavfilter/vmaf_motion.h   |  42 +++++
>>   6 files changed, 474 insertions(+)
>>   create mode 100644 libavfilter/vf_vmafmotion.c
>>   create mode 100644 libavfilter/vmaf_motion.h
> 
> 
> One more general comment on this filter: it appears to me that the motion
> is calculated based on the reference, and we only use the "main" to apply
> the metadata to. Although this makes sense from the "vmaf" filter
> perspective, I'm actually wondering if - from the perspective of the
> "vmafmotion" filter alone, it wouldn't be simpler to just have a single
> filterpad input ("reference") and apply the metadata on it (when used by
> itself). (The "vmaf" filter could still apply metadata on the "main").
> 
> What do people think? Would you prefer the "vmaf" and "vmafmotion" to
> consistently apply the metadata on the "main" video frames, or would you
> prefer that the "vmafmotion" filter more accurately presents which frame is
> used for the motion scoring, which also happens to lead to simpler code /
> filterchains?

So if I understand this correctly the "vmafmotion" filter would work 
similarly (in structure) to the "idet" filter, comparing consecutive 
frames on a single filterpad input and attaching metadata to the output 
frames? Yes, that would make using the "vmafmotion" filter more simple 
and flexible, IMHO.

Regards,
Tobias

Patch hide | download patch | download mbox

diff --git a/Changelog b/Changelog
index cae5254..883e08f 100644
--- a/Changelog
+++ b/Changelog
@@ -43,6 +43,7 @@  version <next>:
 - add --disable-autodetect build switch
 - drop deprecated qtkit input device (use avfoundation instead)
 - despill video filter
+- vmafmotion video filter
 
 version 3.3:
 - CrystalHD decoder moved to new decode API
diff --git a/doc/filters.texi b/doc/filters.texi
index 7790367..9a348b6 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -15488,6 +15488,22 @@  vignette='PI/4+random(1)*PI/50':eval=frame
 
 @end itemize
 
+@section vmafmotion
+
+Obtain the average vmaf motion score between two input videos.
+It is one of the component filters of VMAF.
+
+This filter takes two input videos.
+
+The obtained average motion score is printed through the logging system.
+
+In the below example the input file @file{main.mpg} being processed is compared
+with the reference file @file{ref.mpg}.
+
+@example
+ffmpeg -i main.mpg -i ref.mpg -lavfi vmafmotion -f null -
+@end example
+
 @section vstack
 Stack input videos vertically.
 
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 1e460ab..de46649 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -328,6 +328,7 @@  OBJS-$(CONFIG_VFLIP_FILTER)                  += vf_vflip.o
 OBJS-$(CONFIG_VIDSTABDETECT_FILTER)          += vidstabutils.o vf_vidstabdetect.o
 OBJS-$(CONFIG_VIDSTABTRANSFORM_FILTER)       += vidstabutils.o vf_vidstabtransform.o
 OBJS-$(CONFIG_VIGNETTE_FILTER)               += vf_vignette.o
+OBJS-$(CONFIG_VMAFMOTION_FILTER)             += vf_vmafmotion.o framesync2.o
 OBJS-$(CONFIG_VSTACK_FILTER)                 += vf_stack.o framesync2.o
 OBJS-$(CONFIG_W3FDIF_FILTER)                 += vf_w3fdif.o
 OBJS-$(CONFIG_WAVEFORM_FILTER)               += vf_waveform.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 9a2cfea..31f1971 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -339,6 +339,7 @@  static void register_all(void)
     REGISTER_FILTER(VIDSTABDETECT,  vidstabdetect,  vf);
     REGISTER_FILTER(VIDSTABTRANSFORM, vidstabtransform, vf);
     REGISTER_FILTER(VIGNETTE,       vignette,       vf);
+    REGISTER_FILTER(VMAFMOTION,     vmafmotion,     vf);
     REGISTER_FILTER(VSTACK,         vstack,         vf);
     REGISTER_FILTER(W3FDIF,         w3fdif,         vf);
     REGISTER_FILTER(WAVEFORM,       waveform,       vf);
diff --git a/libavfilter/vf_vmafmotion.c b/libavfilter/vf_vmafmotion.c
new file mode 100644
index 0000000..873d00f
--- /dev/null
+++ b/libavfilter/vf_vmafmotion.c
@@ -0,0 +1,413 @@ 
+/*
+ * Copyright (c) 2017 Ronald S. Bultje <rsbultje@gmail.com>
+ * Copyright (c) 2017 Ashish Pratap Singh <ashk43712@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Calculate VMAF Motion score between two input videos.
+ */
+
+#include "libavutil/avstring.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "avfilter.h"
+#include "drawutils.h"
+#include "formats.h"
+#include "framesync2.h"
+#include "internal.h"
+#include "vmaf_motion.h"
+#include "video.h"
+
+/* Private context for the vmafmotion filter. */
+typedef struct VMAFMotionContext {
+    const AVClass *class;
+    /* Dual-input frame synchronizer; do_vmafmotion() is its on_event. */
+    FFFrameSync fs;
+    const AVPixFmtDescriptor *desc;
+    /* 5-tap blur kernel converted to Q(N) fixed point in init(). */
+    int filter[5];
+    int width;
+    int height;
+    /* Blurred luma of the previous and current frame plus scratch space;
+     * each is a data_sz = ALIGN_CEIL(width * 2) * height byte buffer. */
+    uint16_t *prev_blur_data;
+    uint16_t *blur_data;
+    uint16_t *temp_data;
+    /* Running sum of per-frame scores and number of frames scored. */
+    double motion_sum;
+    uint64_t nb_frames;
+} VMAFMotionContext;
+
+/* Round x up to a multiple of MAX_ALIGN.  Reviewer note: FFALIGN() from
+ * libavutil/macros.h provides this for power-of-two alignments. */
+#define MAX_ALIGN 32
+#define ALIGN_CEIL(x) ((x) + ((x) % MAX_ALIGN ? MAX_ALIGN - (x) % MAX_ALIGN : 0))
+
+/* No user-visible options yet (reviewer note: could be a NULL define). */
+static const AVOption vmafmotion_options[] = {
+    { NULL }
+};
+
+FRAMESYNC_DEFINE_CLASS(vmafmotion, VMAFMotionContext, fs);
+
+/* Sum of absolute differences between two planes of uint16_t samples.
+ * Strides here are expressed in pixels (uint16_t units), not bytes
+ * (reviewer note: FFmpeg convention is byte strides). */
+static uint64_t image_sad(const uint16_t *img1, const uint16_t *img2, int w, int h,
+                        ptrdiff_t img1_stride, ptrdiff_t img2_stride)
+{
+    uint64_t sum = 0;
+    int i, j;
+
+    for (i = 0; i < h; i++) {
+        for (j = 0; j < w; j++) {
+            sum += abs(img1[j] - img2[j]);
+        }
+        img1 += img1_stride;
+        img2 += img2_stride;
+    }
+
+    return sum;
+}
+
+/* Round n down to the nearest multiple of m. */
+static inline int floorn(int n, int m)
+{
+    return n - n % m;
+}
+
+/* Round n up to the nearest multiple of m.  Note: every caller below
+ * passes m == 1, for which both helpers simply return n. */
+static inline int ceiln(int n, int m)
+{
+    return n % m ? n + (m - n % m) : n;
+}
+
+/* Horizontal pass of the separable blur: 1-D convolution along each row
+ * with filt_w taps in Q(N) fixed point, mirroring samples that fall
+ * outside the left/right image borders.  Strides are in pixels. */
+static void convolution_x(const int *filter, int filt_w, const uint16_t *src,
+                          uint16_t *dst, int w, int h, ptrdiff_t src_stride,
+                          ptrdiff_t dst_stride)
+{
+    int radius = filt_w / 2;
+    /* With m == 1 these are just radius and w - (filt_w - radius). */
+    int borders_left = ceiln(radius, 1);
+    int borders_right = floorn(w - (filt_w - radius), 1);
+    int i, j, k;
+    int sum = 0;
+
+    for (i = 0; i < h; i++) {
+        /* Left border: reflect out-of-range tap positions into the image. */
+        for (j = 0; j < borders_left; j++) {
+            sum = 0;
+            for (k = 0; k < filt_w; k++) {
+                int j_tap = FFABS(j - radius + k);
+                if (j_tap >= w) {
+                    j_tap = w - (j_tap - w + 1);
+                }
+                sum += filter[k] * src[i * src_stride + j_tap];
+            }
+            dst[i * dst_stride + j] = sum >> N;
+        }
+
+        /* Interior: all taps in range, no edge handling needed. */
+        for (j = borders_left; j < borders_right; j++) {
+            int sum = 0;
+            for (k = 0; k < filt_w; k++) {
+                sum += filter[k] * src[i * src_stride + j - radius + k];
+            }
+            dst[i * dst_stride + j] = sum >> N;
+        }
+
+        /* Right border: reflect again. */
+        for (j = borders_right; j < w; j++) {
+            sum = 0;
+            for (k = 0; k < filt_w; k++) {
+                int j_tap = FFABS(j - radius + k);
+                if (j_tap >= w) {
+                    j_tap = w - (j_tap - w + 1);
+                }
+                sum += filter[k] * src[i * src_stride + j_tap];
+            }
+            dst[i * dst_stride + j] = sum >> N;
+        }
+    }
+}
+
+/* Vertical pass of the separable blur, templated on the source sample
+ * type (uint8_t for 8-bit, uint16_t for 10-bit input).  Top and bottom
+ * borders mirror rows like convolution_x(); interior rows skip the edge
+ * checks.  Output is widened to uint16_t and shifted down by N.
+ * Reviewer note: a smaller shift would keep extra fractional bits of
+ * precision in the uint16_t intermediate. */
+#define conv_y_fn(type, bits) \
+    static void convolution_y_##bits##bit(const int *filter, int filt_w, \
+                                          const type *src, uint16_t *dst, \
+                                          int w, int h, ptrdiff_t src_stride, \
+                                          ptrdiff_t dst_stride) \
+{ \
+    int radius = filt_w / 2; \
+    int borders_top = ceiln(radius, 1); \
+    int borders_bottom = floorn(h - (filt_w - radius), 1); \
+    int i, j, k; \
+    int sum = 0; \
+    \
+    for (i = 0; i < borders_top; i++) { \
+        for (j = 0; j < w; j++) { \
+            sum = 0; \
+            for (k = 0; k < filt_w; k++) { \
+                int i_tap = FFABS(i - radius + k); \
+                if (i_tap >= h) { \
+                    i_tap = h - (i_tap - h + 1); \
+                } \
+                sum += filter[k] * src[i_tap * src_stride + j]; \
+            } \
+            dst[i * dst_stride + j] = sum >> N; \
+        } \
+    } \
+    for (i = borders_top; i < borders_bottom; i++) { \
+        for (j = 0; j < w; j++) { \
+            sum = 0; \
+            for (k = 0; k < filt_w; k++) { \
+                sum += filter[k] * src[(i - radius + k) * src_stride + j]; \
+            } \
+            dst[i * dst_stride + j] = sum >> N; \
+        } \
+    } \
+    for (i = borders_bottom; i < h; i++) { \
+        for (j = 0; j < w; j++) { \
+            sum = 0; \
+            for (k = 0; k < filt_w; k++) { \
+                int i_tap = FFABS(i - radius + k); \
+                if (i_tap >= h) { \
+                    i_tap = h - (i_tap - h + 1); \
+                } \
+                sum += filter[k] * src[i_tap * src_stride + j]; \
+            } \
+            dst[i * dst_stride + j] = sum >> N; \
+        } \
+    } \
+}
+
+conv_y_fn(uint8_t, 8);
+conv_y_fn(uint16_t, 10);
+
+/* Full separable blur: vertical pass from src into tmp, then horizontal
+ * pass from tmp into dst.  "type" is the source bit depth, 8 or 10
+ * (reviewer note: renaming it to "bitdepth" was requested). */
+void convolution_f32(const int *filter, int filt_w, const void *src,
+                     uint16_t *dst, uint16_t *tmp, int w, int h,
+                     ptrdiff_t src_stride, ptrdiff_t dst_stride, uint8_t type)
+{
+    if(type == 8) {
+        convolution_y_8bit(filter, filt_w, (const uint8_t *) src, tmp, w, h,
+                           src_stride, dst_stride);
+    } else {
+        convolution_y_10bit(filter, filt_w, (const uint16_t *) src, tmp, w, h,
+                            src_stride, dst_stride);
+    }
+
+    convolution_x(filter, filt_w, tmp, dst, w, h, dst_stride, dst_stride);
+}
+
+/* Motion score between two blurred planes: mean per-pixel SAD.  The
+ * ref/main strides are byte strides, converted here to uint16_t units
+ * for image_sad().  Always returns 0. */
+int compute_vmafmotion(const uint16_t *ref, const uint16_t *main, int w, int h,
+                       ptrdiff_t ref_stride, ptrdiff_t main_stride, double *score)
+{
+    uint64_t sad = image_sad(ref, main, w, h, ref_stride / sizeof(uint16_t),
+                       main_stride / sizeof(uint16_t));
+    *score = (double) (sad * 1.0 / (w * h));
+
+    return 0;
+}
+
+/* Store a float as frame metadata under the given key, two decimals. */
+static void set_meta(AVDictionary **metadata, const char *key, float d)
+{
+    char value[128];
+    snprintf(value, sizeof(value), "%0.2f", d);
+    av_dict_set(metadata, key, value, 0);
+}
+
+/* Framesync on_event callback: blur the reference frame's luma plane,
+ * score it against the previous blurred frame, attach the score to the
+ * "main" frame's metadata and pass "main" downstream. */
+static int do_vmafmotion(FFFrameSync *fs)
+{
+    AVFilterContext *ctx = fs->parent;
+    VMAFMotionContext *s = ctx->priv;
+    AVFrame *main, *ref;
+    AVDictionary **metadata;
+    int ret;
+    ptrdiff_t ref_stride;
+    ptrdiff_t ref_px_stride;
+    ptrdiff_t stride;
+    ptrdiff_t px_stride;
+    size_t data_sz;
+    double score;
+
+    ret = ff_framesync2_dualinput_get(fs, &main, &ref);
+    if (ret < 0)
+        return ret;
+    /* No reference frame paired with this main frame: pass it through. */
+    if (!ref)
+        return ff_filter_frame(ctx->outputs[0], main);
+
+    metadata = &main->metadata;
+
+    ref_stride = ref->linesize[0];
+    stride = ALIGN_CEIL(s->width * sizeof(uint16_t));
+    data_sz = (size_t)stride * s->height;
+    px_stride = stride / sizeof(uint16_t);
+
+    /* Blur the reference luma into blur_data.  Strides are converted to
+     * pixel units per depth (reviewer note: byte strides would make the
+     * branch and casts unnecessary). */
+    if (s->desc->comp[0].depth <= 8) {
+        ref_px_stride = ref_stride / sizeof(uint8_t);
+        convolution_f32(s->filter, 5, (const uint8_t *) ref->data[0],
+                        s->blur_data, s->temp_data, s->width, s->height,
+                        ref_px_stride, px_stride, 8);
+    } else {
+        ref_px_stride = ref_stride / sizeof(uint16_t);
+        convolution_f32(s->filter, 5, (const uint16_t *) ref->data[0],
+                        s->blur_data, s->temp_data, s->width, s->height,
+                        ref_px_stride, px_stride, 10);
+    }
+
+    /* The very first frame has no predecessor to compare against. */
+    if(!s->nb_frames) {
+        score = 0.0;
+    } else {
+        compute_vmafmotion(s->prev_blur_data, s->blur_data, s->width, s->height,
+                           stride, stride, &score);
+    }
+
+    /* Keep this frame's blur for the next comparison (reviewer note: an
+     * FFSWAP of the two pointers would avoid the full copy). */
+    memcpy(s->prev_blur_data, s->blur_data, data_sz);
+
+    set_meta(metadata, "lavfi.vmafmotion.score", score);
+
+    s->nb_frames++;
+
+    s->motion_sum += score;
+
+    return ff_filter_frame(ctx->outputs[0], main);
+}
+
+/* Convert the float FILTER_5 taps to Q(N) fixed point and register the
+ * per-frame framesync callback. */
+static av_cold int init(AVFilterContext *ctx)
+{
+    VMAFMotionContext *s = ctx->priv;
+
+    int i;
+    for(i = 0; i < 5; i++) {
+        s->filter[i] = lrint(FILTER_5[i] * (1 << N));
+    }
+
+    s->fs.on_event = do_vmafmotion;
+
+    return 0;
+}
+
+/* Supported formats: planar YUV at 8 and 10 bits.  Reviewer note: the
+ * LE suffixes should be dropped; the filter handles native endianness. */
+static int query_formats(AVFilterContext *ctx)
+{
+    static const enum AVPixelFormat pix_fmts[] = {
+        AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV420P,
+        AV_PIX_FMT_YUV444P10LE, AV_PIX_FMT_YUV422P10LE, AV_PIX_FMT_YUV420P10LE,
+        AV_PIX_FMT_NONE
+    };
+
+    AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
+    if (!fmts_list)
+        return AVERROR(ENOMEM);
+    return ff_set_common_formats(ctx, fmts_list);
+}
+
+/* Configure the "reference" input: require both inputs to match in size
+ * and pixel format, then allocate the blur and scratch buffers.  Buffers
+ * allocated before a later failure are released in uninit(). */
+static int config_input_ref(AVFilterLink *inlink)
+{
+    AVFilterContext *ctx  = inlink->dst;
+    VMAFMotionContext *s = ctx->priv;
+    ptrdiff_t stride;
+    size_t data_sz;
+
+    if (ctx->inputs[0]->w != ctx->inputs[1]->w ||
+        ctx->inputs[0]->h != ctx->inputs[1]->h) {
+        av_log(ctx, AV_LOG_ERROR, "Width and height of input videos must be same.\n");
+        return AVERROR(EINVAL);
+    }
+    if (ctx->inputs[0]->format != ctx->inputs[1]->format) {
+        av_log(ctx, AV_LOG_ERROR, "Inputs must be of same pixel format.\n");
+        return AVERROR(EINVAL);
+    }
+
+    s->desc = av_pix_fmt_desc_get(inlink->format);
+    s->width = ctx->inputs[0]->w;
+    s->height = ctx->inputs[0]->h;
+
+    /* One aligned uint16_t row per line; same size for all three buffers. */
+    stride = ALIGN_CEIL(s->width * sizeof(uint16_t));
+    data_sz = (size_t)stride * s->height;
+
+    if (!(s->prev_blur_data = av_malloc(data_sz))) {
+        return AVERROR(ENOMEM);
+    }
+    if (!(s->blur_data = av_malloc(data_sz))) {
+        return AVERROR(ENOMEM);
+    }
+    if (!(s->temp_data = av_malloc(data_sz))) {
+        return AVERROR(ENOMEM);
+    }
+
+    return 0;
+}
+
+/* Configure the output to mirror the "main" input's properties and set
+ * up / configure the dual-input frame synchronizer. */
+static int config_output(AVFilterLink *outlink)
+{
+    AVFilterContext *ctx = outlink->src;
+    VMAFMotionContext *s = ctx->priv;
+    AVFilterLink *mainlink = ctx->inputs[0];
+    int ret;
+
+    ret = ff_framesync2_init_dualinput(&s->fs, ctx);
+    if (ret < 0)
+        return ret;
+    outlink->w = mainlink->w;
+    outlink->h = mainlink->h;
+    outlink->time_base = mainlink->time_base;
+    outlink->sample_aspect_ratio = mainlink->sample_aspect_ratio;
+    outlink->frame_rate = mainlink->frame_rate;
+    if ((ret = ff_framesync2_configure(&s->fs)) < 0)
+        return ret;
+    return 0;
+}
+
+/* Drive the frame synchronizer; actual work happens in do_vmafmotion(). */
+static int activate(AVFilterContext *ctx)
+{
+    VMAFMotionContext *s = ctx->priv;
+    return ff_framesync2_activate(&s->fs);
+}
+
+
+/* Log the average motion score over all scored frames, then tear down
+ * the framesync state and free the blur/scratch buffers. */
+static av_cold void uninit(AVFilterContext *ctx)
+{
+    VMAFMotionContext *s = ctx->priv;
+
+    ff_framesync2_uninit(&s->fs);
+
+    if (s->nb_frames > 0) {
+        av_log(ctx, AV_LOG_INFO, "VMAF Motion avg: %.3f\n", s->motion_sum / s->nb_frames);
+    }
+
+    av_free(s->prev_blur_data);
+    av_free(s->blur_data);
+    av_free(s->temp_data);
+}
+
+/* Two video inputs; the reference pad performs size/format validation
+ * and buffer allocation in config_input_ref(). */
+static const AVFilterPad vmafmotion_inputs[] = {
+    {
+        .name         = "main",
+        .type         = AVMEDIA_TYPE_VIDEO,
+    },{
+        .name         = "reference",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = config_input_ref,
+    },
+    { NULL }
+};
+
+/* Single pass-through output carrying the "main" frames plus metadata. */
+static const AVFilterPad vmafmotion_outputs[] = {
+    {
+        .name          = "default",
+        .type          = AVMEDIA_TYPE_VIDEO,
+        .config_props  = config_output,
+    },
+    { NULL }
+};
+
+AVFilter ff_vf_vmafmotion = {
+    .name          = "vmafmotion",
+    .description   = NULL_IF_CONFIG_SMALL("Calculate the VMAF Motion score between two video streams."),
+    .init          = init,
+    .uninit        = uninit,
+    .query_formats = query_formats,
+    .activate      = activate,
+    .priv_size     = sizeof(VMAFMotionContext),
+    .priv_class    = &vmafmotion_class,
+    .inputs        = vmafmotion_inputs,
+    .outputs       = vmafmotion_outputs,
+};
diff --git a/libavfilter/vmaf_motion.h b/libavfilter/vmaf_motion.h
new file mode 100644
index 0000000..eb41636
--- /dev/null
+++ b/libavfilter/vmaf_motion.h
@@ -0,0 +1,42 @@ 
+/*
+ * Copyright (c) 2017 Ronald S. Bultje <rsbultje@gmail.com>
+ * Copyright (c) 2017 Ashish Pratap Singh <ashk43712@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Reviewer note: guard should follow libavfilter convention, e.g.
+ * AVFILTER_VMAFMOTION_H. */
+#ifndef MOTION_TOOLS_H_
+#define MOTION_TOOLS_H_
+
+/* Fixed-point fractional bits used by the convolution code (reviewer
+ * note: very generic name; belongs in the C file). */
+#define N 15
+
+/* Normalized 5-tap blur kernel; converted to fixed point in init().
+ * Reviewer note: should also live in the C file, not the header. */
+static const float FILTER_5[5] = {
+    0.054488685,
+    0.244201342,
+    0.402619947,
+    0.244201342,
+    0.054488685
+};
+
+void convolution_f32(const int *filter, int filt_width, const void *src,
+                     uint16_t *dst, uint16_t *tmp, int w, int h,
+                     ptrdiff_t src_stride, ptrdiff_t dst_stride, uint8_t type);
+
+int compute_vmafmotion(const uint16_t *ref, const uint16_t *main, int w, int h,
+                       ptrdiff_t ref_stride, ptrdiff_t main_stride, double *score);
+
+#endif /* MOTION_TOOLS_H_ */