[FFmpeg-devel,RFC] avfilter/fastdeint: import simple cpu-optimized deinterlacing algorithms from VLC

Submitted by Aman Gupta on Sept. 9, 2019, 8:12 p.m.

Details

Message ID 20190909201220.13070-1-ffmpeg@tmm1.net
State New
Headers show

Commit Message

Aman Gupta Sept. 9, 2019, 8:12 p.m.
From: Aman Gupta <aman@tmm1.net>

These are simple algorithms which can be run efficiently
on low powered devices to produce deinteraced images.

Signed-off-by: Aman Gupta <aman@tmm1.net>
---
 doc/filters.texi                 |  27 ++
 libavfilter/Makefile             |   1 +
 libavfilter/aarch64/Makefile     |   1 +
 libavfilter/aarch64/merge_neon.S |  98 ++++++
 libavfilter/allfilters.c         |   1 +
 libavfilter/arm/Makefile         |   3 +
 libavfilter/arm/merge_armv6.S    |  70 ++++
 libavfilter/arm/merge_neon.S     | 109 ++++++
 libavfilter/vf_fastdeint.c       | 588 +++++++++++++++++++++++++++++++
 9 files changed, 898 insertions(+)
 create mode 100644 libavfilter/aarch64/merge_neon.S
 create mode 100644 libavfilter/arm/Makefile
 create mode 100644 libavfilter/arm/merge_armv6.S
 create mode 100644 libavfilter/arm/merge_neon.S
 create mode 100644 libavfilter/vf_fastdeint.c

Comments

Paul B Mahol Sept. 9, 2019, 8:38 p.m.
On 9/9/19, Aman Gupta <ffmpeg@tmm1.net> wrote:
> From: Aman Gupta <aman@tmm1.net>
>
> These are simple algorithms which can be run efficiently
> on low powered devices to produce deinteraced images.
>
> Signed-off-by: Aman Gupta <aman@tmm1.net>
> ---
>  doc/filters.texi                 |  27 ++
>  libavfilter/Makefile             |   1 +
>  libavfilter/aarch64/Makefile     |   1 +
>  libavfilter/aarch64/merge_neon.S |  98 ++++++
>  libavfilter/allfilters.c         |   1 +
>  libavfilter/arm/Makefile         |   3 +
>  libavfilter/arm/merge_armv6.S    |  70 ++++
>  libavfilter/arm/merge_neon.S     | 109 ++++++
>  libavfilter/vf_fastdeint.c       | 588 +++++++++++++++++++++++++++++++
>  9 files changed, 898 insertions(+)
>  create mode 100644 libavfilter/aarch64/merge_neon.S
>  create mode 100644 libavfilter/arm/Makefile
>  create mode 100644 libavfilter/arm/merge_armv6.S
>  create mode 100644 libavfilter/arm/merge_neon.S
>  create mode 100644 libavfilter/vf_fastdeint.c
>
> diff --git a/doc/filters.texi b/doc/filters.texi
> index 6c81e1da40..55d9adeb81 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -9796,6 +9796,33 @@ fade=t=in:st=5.5:d=0.5
>
>  @end itemize
>
> +@section fastdeint
> +Fast deinterlacing algorithms.
> +
> +@table @option
> +@item mode
> +Deinterlacing algorithm to use.
> +
> +It accepts the following values:
> +@table @samp
> +@item discard
> +Discard bottom frame.
> +
> +@item mean
> +Half resolution blender.
> +
> +@item blend
> +Full resolution blender.
> +
> +@item bob
> +Bob doubler.
> +
> +@item linear
> +Bob doubler with linear interpolation.
> +@end table
> +
> +@end table
> +
>  @section fftdnoiz
>  Denoise frames using 3D FFT (frequency domain filtering).
>
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index 3ef4191d9a..a2b3566ec0 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -234,6 +234,7 @@ OBJS-$(CONFIG_EROSION_OPENCL_FILTER)         +=
> vf_neighbor_opencl.o opencl.o \
>                                                  opencl/neighbor.o
>  OBJS-$(CONFIG_EXTRACTPLANES_FILTER)          += vf_extractplanes.o
>  OBJS-$(CONFIG_FADE_FILTER)                   += vf_fade.o
> +OBJS-$(CONFIG_FASTDEINT_FILTER)              += vf_fastdeint.o
>  OBJS-$(CONFIG_FFTDNOIZ_FILTER)               += vf_fftdnoiz.o
>  OBJS-$(CONFIG_FFTFILT_FILTER)                += vf_fftfilt.o
>  OBJS-$(CONFIG_FIELD_FILTER)                  += vf_field.o
> diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
> index b58daa3a3f..2b0ad92893 100644
> --- a/libavfilter/aarch64/Makefile
> +++ b/libavfilter/aarch64/Makefile
> @@ -1,3 +1,4 @@
>  OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/vf_nlmeans_init.o
>
> +NEON-OBJS-$(CONFIG_FASTDEINT_FILTER)         += aarch64/merge_neon.o
>  NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/vf_nlmeans_neon.o
> diff --git a/libavfilter/aarch64/merge_neon.S
> b/libavfilter/aarch64/merge_neon.S
> new file mode 100644
> index 0000000000..62377331a4
> --- /dev/null
> +++ b/libavfilter/aarch64/merge_neon.S
> @@ -0,0 +1,98 @@
> +/*
> + * Copyright (c) 2009-2016 Rémi Denis-Courmont, Janne Grunau, VLC authors
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +#define dest x0
> +#define src1 x1
> +#define src2 x2
> +#define size x3
> +
> +        .align 2
> +        // NOTE: Offset and pitch must be multiple of 16-bytes.
> +function ff_merge8_neon, export=1
> +        ands            x5, size, #~63
> +        b.eq            2f
> +        mov             x10, #64
> +        add             x11, src1, #32
> +        add             x12, src2, #32
> +1:
> +        ld1             {v0.16b,v1.16b}, [src1], x10
> +        ld1             {v4.16b,v5.16b}, [src2], x10
> +        ld1             {v2.16b,v3.16b}, [x11], x10
> +        uhadd           v0.16b, v0.16b, v4.16b
> +        ld1             {v6.16b,v7.16b}, [x12], x10
> +        subs            x5, x5, #64
> +        uhadd           v1.16b, v1.16b, v5.16b
> +        uhadd           v2.16b, v2.16b, v6.16b
> +        uhadd           v3.16b, v3.16b, v7.16b
> +        st1             {v0.16b,v1.16b}, [dest], #32
> +        st1             {v2.16b,v3.16b}, [dest], #32
> +        b.gt            1b
> +2:
> +        tbz             size, #5,  3f
> +        ld1             {v0.16b,v1.16b}, [src1], #32
> +        ld1             {v4.16b,v5.16b}, [src2], #32
> +        uhadd           v0.16b, v0.16b, v4.16b
> +        uhadd           v1.16b, v1.16b, v5.16b
> +        st1             {v0.16b,v1.16b}, [dest], #32
> +3:
> +        tbz             size, #4, 4f
> +        ld1             {v0.16b}, [src1]
> +        ld1             {v4.16b}, [src2]
> +        uhadd           v0.16b, v0.16b, v4.16b
> +        st1             {v0.16b}, [dest]
> +4:
> +        ret
> +endfunc
> +
> +        .align 2
> +function ff_merge16_neon, export=1
> +        ands            x5, size, #~63
> +        b.eq            2f
> +1:
> +        ld1             {v0.8h,v1.8h}, [src1], #32
> +        ld1             {v4.8h,v5.8h}, [src2], #32
> +        ld1             {v2.8h,v3.8h}, [src1], #32
> +        uhadd           v0.8h, v0.8h, v4.8h
> +        ld1             {v6.8h,v7.8h}, [src2], #32
> +        uhadd           v1.8h, v1.8h, v5.8h
> +        uhadd           v2.8h, v2.8h, v6.8h
> +        uhadd           v3.8h, v3.8h, v7.8h
> +        st1             {v0.8h,v1.8h}, [dest], #32
> +        st1             {v2.8h,v3.8h}, [dest], #32
> +        subs            x5, x5, #64
> +        b.gt            1b
> +2:
> +        tbz             size, #5, 3f
> +        ld1             {v0.8h,v1.8h}, [src1], #32
> +        ld1             {v4.8h,v5.8h}, [src2], #32
> +        uhadd           v0.8h, v0.8h, v4.8h
> +        uhadd           v1.8h, v1.8h, v5.8h
> +        st1             {v0.8h,v1.8h}, [dest], #32
> +3:
> +        tbz             size, #4,  4f
> +        ld1             {v0.8h}, [src1]
> +        ld1             {v4.8h}, [src2]
> +        uhadd           v0.8h, v0.8h,v4.8h
> +        st1             {v0.8h}, [dest]
> +4:
> +        ret
> +endfunc
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index b675c688ee..6631af2ffe 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -219,6 +219,7 @@ extern AVFilter ff_vf_erosion;
>  extern AVFilter ff_vf_erosion_opencl;
>  extern AVFilter ff_vf_extractplanes;
>  extern AVFilter ff_vf_fade;
> +extern AVFilter ff_vf_fastdeint;
>  extern AVFilter ff_vf_fftdnoiz;
>  extern AVFilter ff_vf_fftfilt;
>  extern AVFilter ff_vf_field;
> diff --git a/libavfilter/arm/Makefile b/libavfilter/arm/Makefile
> new file mode 100644
> index 0000000000..c92d62fac9
> --- /dev/null
> +++ b/libavfilter/arm/Makefile
> @@ -0,0 +1,3 @@
> +ARMV6-OBJS-$(CONFIG_FASTDEINT_FILTER)  += arm/merge_armv6.o
> +
> +NEON-OBJS-$(CONFIG_FASTDEINT_FILTER)   += arm/merge_neon.o
> diff --git a/libavfilter/arm/merge_armv6.S b/libavfilter/arm/merge_armv6.S
> new file mode 100644
> index 0000000000..9b551c2c6c
> --- /dev/null
> +++ b/libavfilter/arm/merge_armv6.S
> @@ -0,0 +1,70 @@
> +/*
> + * Copyright (c) 2009-2012 Rémi Denis-Courmont, VLC authors
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> + */
> +
> +#include "libavutil/arm/asm.S"
> +
> +#define dest r0
> +#define src1 r1
> +#define src2 r2
> +#define size r3
> +
> +        .align 2
> +function ff_merge8_armv6, export=1
> +        push            {r4-r9,lr}
> +1:
> +        pld             [src1, #64]
> +        ldm             src1!, {r4-r5}
> +        pld             [src2, #64]
> +        ldm             src2!, {r8-r9}
> +        subs            size, size, #16
> +        uhadd8          r4, r4, r8
> +        ldm             src1!, {r6-r7}
> +        uhadd8          r5, r5, r9
> +        ldm             src2!, {ip,lr}
> +        uhadd8          r6, r6, ip
> +        stm             dest!, {r4-r5}
> +        uhadd8          r7, r7, lr
> +        stm             dest!, {r6-r7}
> +        it              eq
> +        popeq           {r4-r9,pc}
> +        b               1b
> +endfunc
> +
> +        .align 2
> +function ff_merge16_armv6, export=1
> +        push            {r4-r9,lr}
> +1:
> +        pld             [src1, #64]
> +        ldm             src1!, {r4-r5}
> +        pld             [src2, #64]
> +        ldm             src2!, {r8-r9}
> +        subs            size, size, #16
> +        uhadd16         r4, r4, r8
> +        ldm             src1!, {r6-r7}
> +        uhadd16         r5, r5, r9
> +        ldm             src2!, {ip,lr}
> +        uhadd16         r6, r6, ip
> +        stm             dest!, {r4-r5}
> +        uhadd16         r7, r7, lr
> +        stm             dest!, {r6-r7}
> +        it              eq
> +        popeq           {r4-r9,pc}
> +        b               1b
> +endfunc
> \ No newline at end of file
> diff --git a/libavfilter/arm/merge_neon.S b/libavfilter/arm/merge_neon.S
> new file mode 100644
> index 0000000000..ae36cf3ca9
> --- /dev/null
> +++ b/libavfilter/arm/merge_neon.S
> @@ -0,0 +1,109 @@
> +/*
> + * Copyright (c) 2009-2012 Rémi Denis-Courmont, VLC authors
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> + */
> +
> +#include "libavutil/arm/asm.S"
> +
> +#define dest r0
> +#define src1 r1
> +#define src2 r2
> +#define size r3
> +
> +        .align 2
> +        @ NOTE: Offset and pitch must be multiple of 16-bytes.
> +function ff_merge8_neon, export=1
> +        cmp             size, #64
> +        blo             2f
> +1:
> +        pld             [src1, #64]
> +        vld1.u8         {q0-q1}, [src1,:128]!
> +        pld             [src2, #64]
> +        vld1.u8         {q8-q9}, [src2,:128]!
> +        vhadd.u8        q0, q0, q8
> +        sub             size, size, #64
> +        vld1.u8         {q2-q3}, [src1,:128]!
> +        vhadd.u8        q1, q1, q9
> +        vld1.u8         {q10-q11}, [src2,:128]!
> +        vhadd.u8        q2, q2, q10
> +        cmp             size, #64
> +        vhadd.u8        q3, q3, q11
> +        vst1.u8         {q0-q1}, [dest,:128]!
> +        vst1.u8         {q2-q3}, [dest,:128]!
> +        bhs             1b
> +2:
> +        cmp             size, #32
> +        blo             3f
> +        vld1.u8         {q0-q1}, [src1,:128]!
> +        sub             size, size, #32
> +        vld1.u8         {q8-q9}, [src2,:128]!
> +        vhadd.u8        q0, q0, q8
> +        vhadd.u8        q1, q1, q9
> +        vst1.u8         {q0-q1}, [dest,:128]!
> +3:
> +        cmp             size, #16
> +        it              lo
> +        bxlo            lr
> +        vld1.u8         {q0}, [src1,:128]!
> +        sub             size, size, #16
> +        vld1.u8         {q8}, [src2,:128]!
> +        vhadd.u8        q0, q0, q8
> +        vst1.u8         {q0}, [dest,:128]!
> +        bx              lr
> +endfunc
> +
> +        .align 2
> +function ff_merge16_neon, export=1
> +        cmp             size, #64
> +        blo             2f
> +1:
> +        pld             [src1, #64]
> +        vld1.u16        {q0-q1}, [src1,:128]!
> +        pld             [src2, #64]
> +        vld1.u16        {q8-q9}, [src2,:128]!
> +        vhadd.u16       q0, q0, q8
> +        sub             size, size, #64
> +        vld1.u16        {q2-q3}, [src1,:128]!
> +        vhadd.u16       q1, q1, q9
> +        vld1.u16        {q10-q11}, [src2,:128]!
> +        vhadd.u16       q2, q2, q10
> +        cmp             size, #64
> +        vhadd.u16       q3, q3, q11
> +        vst1.u16        {q0-q1}, [dest,:128]!
> +        vst1.u16        {q2-q3}, [dest,:128]!
> +        bhs             1b
> +2:
> +        cmp             size, #32
> +        blo             3f
> +        vld1.u16        {q0-q1}, [src1,:128]!
> +        sub             size, size, #32
> +        vld1.u16        {q8-q9}, [src2,:128]!
> +        vhadd.u16       q0, q0, q8
> +        vhadd.u16       q1, q1, q9
> +        vst1.u16        {q0-q1}, [dest,:128]!
> +3:
> +        cmp             size, #16
> +        it              lo
> +        bxlo            lr
> +        vld1.u16        {q0}, [src1,:128]!
> +        sub             size, size, #16
> +        vld1.u16        {q8}, [src2,:128]!
> +        vhadd.u16       q0, q0, q8
> +        vst1.u16        {q0}, [dest,:128]!
> +        bx              lr
> +endfunc
> \ No newline at end of file
> diff --git a/libavfilter/vf_fastdeint.c b/libavfilter/vf_fastdeint.c
> new file mode 100644
> index 0000000000..5ddd8be392
> --- /dev/null
> +++ b/libavfilter/vf_fastdeint.c
> @@ -0,0 +1,588 @@
> +/*
> + * Copyright (C) 2015 Aman Gupta <aman@tmm1.net>
> + *               2000-2011 VLC authors and VideoLAN
> + *
> + * Author: Sam Hocevar <sam@zoy.org>
> + *         Damien Lucas <nitrox@videolan.org>
> + *         Laurent Aimar <fenrir@videolan.org>
> + *         Sigmund Augdal Helberg <sigmunau@videolan.org>
> + *
> + * These algorithms are derived from the VLC project's
> + * modules/video_filter/deinterlace/algo_basic.c
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> + */
> +
> +#include "libavutil/avassert.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/common.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/pixdesc.h"
> +#include "libavutil/imgutils.h"
> +#include "libavutil/timestamp.h"
> +#include "avfilter.h"
> +#include "formats.h"
> +#include "internal.h"
> +#include "video.h"
> +
> +enum Mode {
> +  MODE_DISCARD,
> +  MODE_MEAN,
> +  MODE_BLEND,
> +  MODE_BOB,
> +  MODE_LINEAR,
> +  MODE_MAX,
> +};
> +
> +typedef void (*merge_fn)(void *dst, const void *src1, const void *src2,
> size_t len);
> +
> +typedef struct FastDeintContext {
> +    const AVClass *class;
> +    merge_fn merge;
> +    int merge_size;
> +    int merge_aligned;
> +    AVFrame *cur, *next;
> +    enum Mode mode;
> +    int eof;
> +} FastDeintContext;
> +
> +static void merge8_c(uint8_t *dst, const uint8_t *src1, const uint8_t
> *src2, size_t bytes)
> +{
> +    for (; bytes > 0; bytes--)
> +        *dst++ = ( *src1++ + *src2++ ) >> 1;
> +}
> +
> +static void merge16_c(uint16_t *dst, const uint16_t *src1, const uint16_t
> *src2, size_t bytes)
> +{
> +    for (size_t words = bytes / 2; words > 0; words--)
> +        *dst++ = ( *src1++ + *src2++ ) >> 1;
> +}
> +
> +static void merge8_unaligned(FastDeintContext *s, uint8_t *dst, const
> uint8_t *src1, const uint8_t *src2, size_t bytes)
> +{
> +    if (s->merge_aligned) {
> +        size_t remainder = bytes % 16;
> +        if (remainder > 0) {
> +            merge8_c(dst, src1, src2, remainder);
> +            bytes -= remainder;
> +            dst += remainder;
> +            src1 += remainder;
> +            src2 += remainder;
> +        }
> +    }
> +    s->merge(dst, src1, src2, bytes);
> +}
> +
> +static void merge16_unaligned(FastDeintContext *s, uint16_t *dst, const
> uint16_t *src1, const uint16_t *src2, size_t bytes)
> +{
> +    if (s->merge_aligned) {
> +        size_t words = bytes / 2;
> +        size_t remainder = words % 8;
> +        if (remainder > 0) {
> +            merge16_c(dst, src1, src2, remainder);
> +            words -= remainder;
> +            dst += remainder;
> +            src1 += remainder;
> +            src2 += remainder;
> +        }
> +    }
> +    s->merge(dst, src1, src2, bytes);
> +}
> +
> +static void merge_unaligned(FastDeintContext *s, void *dst, const void
> *src1, const void *src2, size_t bytes)
> +{
> +    if (s->merge_size == 16)
> +        merge16_unaligned(s, dst, src1, src2, bytes);
> +    else
> +        merge8_unaligned(s, dst, src1, src2, bytes);
> +}
> +
> +#if HAVE_SSE2_INLINE && defined(__x86_64__)
> +static void merge8_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t
> *src2, size_t bytes)
> +{
> +    for(; bytes > 0 && ((uintptr_t)src1 & 15); bytes--)
> +        *dst++ = ( *src1++ + *src2++ ) >> 1;
> +
> +    for (; bytes >= 16; bytes -= 16) {
> +        __asm__  __volatile__( "movdqu %2,%%xmm1;"
> +                               "pavgb %1, %%xmm1;"
> +                               "movdqu %%xmm1, %0" :"=m" (*dst):
> +                                                 "m" (*src1),
> +                                                 "m" (*src2) : "xmm1" );
> +        dst += 16;
> +        src1 += 16;
> +        src2 += 16;
> +    }
> +
> +    if (bytes > 0) {
> +        merge8_c(dst, src1, src2, bytes);
> +    }
> +}
> +static void merge16_sse2(uint16_t *dst, const uint16_t *src1, const
> uint16_t *src2, size_t bytes)
> +{
> +    size_t words = bytes / 2;
> +
> +    for(; words > 0 && ((uintptr_t)src1 & 15); words--)
> +        *dst++ = ( *src1++ + *src2++ ) >> 1;
> +
> +    for (; words >= 8; words -= 8) {
> +        __asm__  __volatile__( "movdqu %2,%%xmm1;"
> +                               "pavgw %1, %%xmm1;"
> +                               "movdqu %%xmm1, %0" :"=m" (*dst):
> +                                                 "m" (*src1),
> +                                                 "m" (*src2) : "xmm1" );
> +        dst += 8;
> +        src1 += 8;
> +        src2 += 8;
> +    }
> +

Unacceptable code. Inline assembly is forbidden.

> +    if (words > 0) {
> +        merge16_c(dst, src1, src2, words * 2);
> +    }
> +}
> +#define merge8 merge8_sse2
> +#define merge16 merge16_sse2
> +#else
> +#define merge8 merge8_c
> +#define merge16 merge16_c
> +#endif
> +
> +static void render_image_single(FastDeintContext *s, AVFrame *out, AVFrame
> *frame)
> +{
> +    int i, planes_nb = 0;
> +    enum Mode mode = s->mode;
> +    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(out->format);
> +
> +    for (i = 0; i < desc->nb_components; i++)
> +        planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
> +
> +    for (i = 0; i < planes_nb; i++) {
> +        int height, bwidth;
> +        int dst_linesize, src_linesize;
> +        const uint8_t *src;
> +        uint8_t *dst;
> +
> +        bwidth = av_image_get_linesize(out->format, out->width, i);
> +        if (bwidth < 0) {
> +            av_log(s, AV_LOG_ERROR, "av_image_get_linesize failed\n");
> +            return;
> +        }
> +
> +        height = out->height;
> +        if (i == 1 || i == 2) {
> +            height = FF_CEIL_RSHIFT(out->height, desc->log2_chroma_h);
> +        }
> +
> +        src = frame->data[i];
> +        dst = out->data[i];
> +        dst_linesize = out->linesize[i];
> +        src_linesize = frame->linesize[i];
> +
> +        if (mode == MODE_BLEND) {
> +            // Copy first line
> +            memcpy(dst, src, bwidth);
> +            dst += dst_linesize;
> +            height--;
> +        }
> +
> +        // Merge remaining lines
> +        for (; height > 0; height--) {
> +            if (mode == MODE_DISCARD)
> +                memcpy(dst, src, bwidth);
> +            else
> +                merge_unaligned(s, dst, src, src + src_linesize, bwidth);
> +            dst += dst_linesize;
> +            src += src_linesize;
> +            if (mode == MODE_MEAN || mode == MODE_DISCARD) {
> +                src += src_linesize;
> +                height--;
> +            }
> +        }
> +    }
> +    if (mode != MODE_DISCARD)
> +        emms_c();
> +}
> +
> +static void render_image_doubler(FastDeintContext *s, AVFrame *out, AVFrame
> *frame, int field)
> +{
> +    int i, planes_nb = 0;
> +    enum Mode mode = s->mode;
> +    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(out->format);
> +
> +    for (i = 0; i < desc->nb_components; i++)
> +        planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
> +
> +    for (i = 0; i < planes_nb; i++) {
> +        int height, bwidth;
> +        int dst_linesize, src_linesize;
> +        const uint8_t *src;
> +        uint8_t *dst;
> +
> +        bwidth = av_image_get_linesize(out->format, out->width, i);
> +        if (bwidth < 0) {
> +            av_log(s, AV_LOG_ERROR, "av_image_get_linesize failed\n");
> +            return;
> +        }
> +        height = out->height;
> +        if (i == 1 || i == 2) {
> +            height = FF_CEIL_RSHIFT(out->height, desc->log2_chroma_h);
> +        }
> +
> +        src = frame->data[i];
> +        dst = out->data[i];
> +        src_linesize = frame->linesize[i];
> +        dst_linesize = out->linesize[i];
> +
> +        // For BOTTOM field we need to add the first line
> +        if (field == 1) {
> +            memcpy(dst, src, bwidth);
> +            dst += dst_linesize;
> +            src += src_linesize;
> +            height--;
> +        }
> +
> +        height -= 2;
> +
> +        for (; height > 0; height-=2) {
> +            memcpy(dst, src, bwidth);
> +            dst += dst_linesize;
> +
> +            if (mode == MODE_LINEAR)
> +                merge_unaligned(s, dst, src, src + 2 * src_linesize,
> bwidth);
> +            else
> +                memcpy(dst, src, bwidth);
> +            dst += dst_linesize;
> +
> +            src += src_linesize * 2;
> +        }
> +
> +        memcpy(dst, src, bwidth);
> +
> +        // For TOP field we need to add the last line
> +        if (field == 0)
> +        {
> +            dst += dst_linesize;
> +            src += src_linesize;
> +            memcpy(dst, src, bwidth);
> +        }
> +    }
> +    if (mode == MODE_LINEAR)
> +        emms_c();
> +}
> +
> +static int filter_frame_single(AVFilterLink *link, AVFrame *frame)
> +{
> +    AVFilterContext *ctx = link->dst;
> +    AVFrame *out;
> +    FastDeintContext *s = ctx->priv;
> +
> +    if (!frame->interlaced_frame) {
> +        return ff_filter_frame(ctx->outputs[0], frame);
> +    }
> +
> +    out = ff_get_video_buffer(ctx->outputs[0], link->w, link->h);
> +    if (!out) {
> +        av_frame_free(&frame);
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    av_frame_copy_props(out, frame);
> +    out->interlaced_frame = 0;
> +    render_image_single(s, out, frame);
> +
> +    av_frame_free(&frame);
> +    return ff_filter_frame(ctx->outputs[0], out);
> +}
> +
> +static AVFrame *copy_frame(AVFilterLink *link, AVFrame *frame)
> +{
> +    AVFilterContext *ctx = link->dst;
> +    AVFrame *out;
> +
> +    if (frame->format == AV_PIX_FMT_VIDEOTOOLBOX)
> +        out = av_frame_alloc();
> +    else
> +        out = ff_get_video_buffer(ctx->outputs[0], link->w, link->h);
> +
> +    if (!out)
> +        return NULL;
> +
> +    av_frame_copy_props(out, frame);
> +    return out;
> +}
> +
> +static int filter_frame_double(AVFilterLink *link, AVFrame *in)
> +{
> +    AVFilterContext *ctx = link->dst;
> +    FastDeintContext *s = ctx->priv;
> +    AVFrame *frame, *out, *out2;
> +    int tff, ret;
> +
> +    s->cur = s->next;
> +    s->next = in;
> +
> +    if (!s->cur) {
> +        return 0;
> +    }
> +
> +    frame = s->cur;
> +
> +    if (!frame->interlaced_frame) {
> +        if (frame->pts != AV_NOPTS_VALUE)
> +            frame->pts *= 2;
> +        s->cur = NULL;
> +        return ff_filter_frame(ctx->outputs[0], frame);
> +    }
> +
> +    tff = frame->top_field_first;
> +    out = copy_frame(link, frame);
> +    if (!out) {
> +        av_frame_free(&frame);
> +        s->cur = NULL;
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    out->interlaced_frame = 0;
> +    if (out->pts != AV_NOPTS_VALUE)
> +        out->pts = out->pts * 2;
> +    render_image_doubler(s, out, frame, !tff);
> +
> +    ret = ff_filter_frame(ctx->outputs[0], out);
> +    if (ret < 0) {
> +        av_frame_free(&frame);
> +        s->cur = NULL;
> +        return ret;
> +    }
> +
> +    out2 = copy_frame(link, frame);
> +    if (!out2) {
> +        av_frame_free(&frame);
> +        s->cur = NULL;
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    out2->interlaced_frame = 0;
> +    av_frame_remove_side_data(out2, AV_FRAME_DATA_A53_CC);
> +    if (out2->pts != AV_NOPTS_VALUE) {
> +        out2->pts = frame->pts + s->next->pts;
> +    }
> +    render_image_doubler(s, out2, frame, tff);
> +
> +    av_frame_free(&frame);
> +    s->cur = NULL;
> +
> +    return ff_filter_frame(ctx->outputs[0], out2);
> +}
> +
> +static int filter_frame(AVFilterLink *link, AVFrame *frame)
> +{
> +    AVFilterContext *ctx = link->dst;
> +    FastDeintContext *s = ctx->priv;
> +
> +    av_assert0(frame);
> +
> +    if (s->mode == MODE_LINEAR || s->mode == MODE_BOB) {
> +        return filter_frame_double(link, frame);
> +    } else {
> +        return filter_frame_single(link, frame);
> +    }
> +}
> +
> +static av_cold void uninit(AVFilterContext *ctx)
> +{
> +    FastDeintContext *s = ctx->priv;
> +    av_frame_free(&s->cur);
> +    av_frame_free(&s->next);
> +}
> +
> +static int query_formats(AVFilterContext *ctx)
> +{
> +    static const enum AVPixelFormat pix_fmts[] = {
> +        AV_PIX_FMT_YUV420P,
> +        AV_PIX_FMT_YUV422P,
> +        AV_PIX_FMT_YUV444P,
> +        AV_PIX_FMT_YUV410P,
> +        AV_PIX_FMT_YUV411P,
> +        AV_PIX_FMT_GRAY8,
> +        AV_PIX_FMT_YUVJ420P,
> +        AV_PIX_FMT_YUVJ422P,
> +        AV_PIX_FMT_YUVJ444P,
> +        AV_PIX_FMT_GRAY16,
> +        AV_PIX_FMT_YUV440P,
> +        AV_PIX_FMT_YUVJ440P,
> +        AV_PIX_FMT_YUV420P9,
> +        AV_PIX_FMT_YUV422P9,
> +        AV_PIX_FMT_YUV444P9,
> +        AV_PIX_FMT_YUV420P10,
> +        AV_PIX_FMT_YUV422P10,
> +        AV_PIX_FMT_YUV444P10,
> +        AV_PIX_FMT_YUV420P12,
> +        AV_PIX_FMT_YUV422P12,
> +        AV_PIX_FMT_YUV444P12,
> +        AV_PIX_FMT_YUV420P14,
> +        AV_PIX_FMT_YUV422P14,
> +        AV_PIX_FMT_YUV444P14,
> +        AV_PIX_FMT_YUV420P16,
> +        AV_PIX_FMT_YUV422P16,
> +        AV_PIX_FMT_YUV444P16,
> +        AV_PIX_FMT_YUVA420P,
> +        AV_PIX_FMT_YUVA422P,
> +        AV_PIX_FMT_YUVA444P,
> +        AV_PIX_FMT_GBRP,
> +        AV_PIX_FMT_GBRP9,
> +        AV_PIX_FMT_GBRP10,
> +        AV_PIX_FMT_GBRP12,
> +        AV_PIX_FMT_GBRP14,
> +        AV_PIX_FMT_GBRP16,
> +        AV_PIX_FMT_GBRAP,
> +        AV_PIX_FMT_NONE
> +    };

Group this ones on less lines somehow.

> +
> +    AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
> +    if (!fmts_list)
> +        return AVERROR(ENOMEM);
> +    return ff_set_common_formats(ctx, fmts_list);
> +}
> +
> +#if ARCH_ARM
> +#include "libavutil/arm/cpu.h"
> +#endif
> +#if ARCH_AARCH64
> +#include "libavutil/aarch64/cpu.h"
> +#endif
> +#if ARCH_AARCH64 || ARCH_ARM
> +void ff_merge8_neon(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
> size_t bytes);
> +void ff_merge16_neon(uint16_t *dst, const uint16_t *src1, const uint16_t
> *src2, size_t bytes);
> +void ff_merge8_armv6(uint8_t *dst, const uint8_t *src1, const uint8_t
> *src2, size_t bytes);
> +void ff_merge16_armv6(uint16_t *dst, const uint16_t *src1, const uint16_t
> *src2, size_t bytes);
> +#endif
> +


I do not like this style. Look what other filters do, like one that
adds x86 SIMD.

> +static int config_props(AVFilterLink *link)
> +{
> +    AVFilterContext *ctx = link->src;
> +    FastDeintContext *s = ctx->priv;
> +    const AVPixFmtDescriptor *pix;
> +#if ARCH_AARCH64 || ARCH_ARM
> +    int cpu_flags = av_get_cpu_flags();
> +#endif

This belongs in separate directory and file. See aarch64 directory

> +
> +    link->w = link->src->inputs[0]->w;
> +    link->h = link->src->inputs[0]->h;
> +    link->time_base  = link->src->inputs[0]->time_base;
> +    link->frame_rate = link->src->inputs[0]->frame_rate;
> +    link->sample_aspect_ratio = link->src->inputs[0]->sample_aspect_ratio;
> +
> +    if (s->mode == MODE_MEAN || s->mode == MODE_DISCARD) {
> +        link->h /= 2;
> +        link->sample_aspect_ratio = av_mul_q(link->sample_aspect_ratio,
> av_make_q(1, 2));
> +    }
> +    if (s->mode == MODE_LINEAR || s->mode == MODE_BOB) {
> +        link->time_base  = av_mul_q(link->time_base,  av_make_q(1, 2));
> +        link->frame_rate = av_mul_q(link->frame_rate, av_make_q(2, 1));
> +    }
> +
> +    pix = av_pix_fmt_desc_get(link->format);
> +    s->merge_size = (pix->comp[0].depth > 8) ? 16 : 8;
> +    s->merge = s->merge_size == 16 ? (merge_fn)merge16 : (merge_fn)merge8;
> +
> +#if ARCH_ARM
> +    if (have_armv6(cpu_flags)) {
> +        s->merge = s->merge_size == 16 ? (merge_fn)ff_merge16_armv6 :
> (merge_fn)ff_merge8_armv6;
> +        s->merge_aligned = 1;
> +    }
> +#endif
> +#if ARCH_AARCH64 || ARCH_ARM
> +    if (have_neon(cpu_flags)) {
> +        s->merge = s->merge_size == 16 ? (merge_fn)ff_merge16_neon :
> (merge_fn)ff_merge8_neon;
> +        s->merge_aligned = 1;
> +    }
> +#endif
> +
> +    return 0;
> +}
> +
> +static int request_frame(AVFilterLink *link)
> +{
> +    AVFilterContext *ctx = link->src;
> +    FastDeintContext *s = ctx->priv;
> +    int ret;
> +
> +    if (s->eof)
> +        return AVERROR_EOF;
> +
> +    ret = ff_request_frame(ctx->inputs[0]);
> +
> +    if (ret == AVERROR_EOF && s->cur) {
> +        AVFrame *next = av_frame_clone(s->next);
> +        if (!next)
> +            return AVERROR(ENOMEM);
> +
> +        next->pts = s->next->pts * 2 - s->cur->pts;
> +        filter_frame(ctx->inputs[0], next);
> +        s->eof = 1;
> +    } else if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return 0;
> +}
> +
> +#define OFFSET(x) offsetof(FastDeintContext, x)
> +#define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
> +
> +#define CONST(name, help, val, unit) { name, help, 0, AV_OPT_TYPE_CONST,
> {.i64=val}, INT_MIN, INT_MAX, FLAGS, unit }
> +
> +static const AVOption fastdeint_options[] = {
> +    { "mode", "specify the deinterlacing mode", OFFSET(mode),
> AV_OPT_TYPE_INT, {.i64=MODE_BLEND}, 0, MODE_MAX-1, FLAGS, "mode" },
> +    CONST("discard", "discard bottom frame", MODE_DISCARD, "mode"),
> +    CONST("mean", "half resolution blender", MODE_MEAN, "mode"),
> +    CONST("blend", "full resolution blender", MODE_BLEND, "mode"),
> +    CONST("bob", "bob doubler", MODE_BOB, "mode"),
> +    CONST("linear", "bob doubler with linear interpolation", MODE_LINEAR,
> "mode"),
> +
> +    { NULL }
> +};
> +
> +AVFILTER_DEFINE_CLASS(fastdeint);
> +
> +static const AVFilterPad fastdeint_inputs[] = {
> +    {
> +        .name          = "default",
> +        .type          = AVMEDIA_TYPE_VIDEO,
> +        .filter_frame  = filter_frame,
> +    },
> +    { NULL }
> +};
> +
> +static const AVFilterPad fastdeint_outputs[] = {
> +    {
> +        .name          = "default",
> +        .type          = AVMEDIA_TYPE_VIDEO,
> +        .config_props  = config_props,
> +        .request_frame = request_frame
> +    },
> +    { NULL }
> +};
> +
> +AVFilter ff_vf_fastdeint = {
> +    .name          = "fastdeint",
> +    .description   = NULL_IF_CONFIG_SMALL("fast deinterlacing algorithms"),

First letter should be capitalized.

> +    .priv_size     = sizeof(FastDeintContext),
> +    .priv_class    = &fastdeint_class,
> +    .uninit        = uninit,
> +    .query_formats = query_formats,
> +    .inputs        = fastdeint_inputs,
> +    .outputs       = fastdeint_outputs,
> +};
> --
> 2.20.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
James Almer Sept. 9, 2019, 8:40 p.m.
On 9/9/2019 5:12 PM, Aman Gupta wrote:
> From: Aman Gupta <aman@tmm1.net>
> 
> These are simple algorithms which can be run efficiently
> on low powered devices to produce deinteraced images.
> 
> Signed-off-by: Aman Gupta <aman@tmm1.net>
> ---
>  doc/filters.texi                 |  27 ++
>  libavfilter/Makefile             |   1 +
>  libavfilter/aarch64/Makefile     |   1 +
>  libavfilter/aarch64/merge_neon.S |  98 ++++++
>  libavfilter/allfilters.c         |   1 +
>  libavfilter/arm/Makefile         |   3 +
>  libavfilter/arm/merge_armv6.S    |  70 ++++
>  libavfilter/arm/merge_neon.S     | 109 ++++++
>  libavfilter/vf_fastdeint.c       | 588 +++++++++++++++++++++++++++++++
>  9 files changed, 898 insertions(+)
>  create mode 100644 libavfilter/aarch64/merge_neon.S
>  create mode 100644 libavfilter/arm/Makefile
>  create mode 100644 libavfilter/arm/merge_armv6.S
>  create mode 100644 libavfilter/arm/merge_neon.S
>  create mode 100644 libavfilter/vf_fastdeint.c

Asm stuff should be in a separate entry.

> 
> diff --git a/doc/filters.texi b/doc/filters.texi
> index 6c81e1da40..55d9adeb81 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -9796,6 +9796,33 @@ fade=t=in:st=5.5:d=0.5
>  
>  @end itemize
>  
> +@section fastdeint
> +Fast deinterlacing algorithms.
> +
> +@table @option
> +@item mode
> +Deinterlacing algorithm to use.
> +
> +It accepts the following values:
> +@table @samp
> +@item discard
> +Discard bottom frame.
> +
> +@item mean
> +Half resolution blender.
> +
> +@item blend
> +Full resolution blender.
> +
> +@item bob
> +Bob doubler.
> +
> +@item linear
> +Bob doubler with linear interpolation.
> +@end table
> +
> +@end table
> +
>  @section fftdnoiz
>  Denoise frames using 3D FFT (frequency domain filtering).
>  
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index 3ef4191d9a..a2b3566ec0 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -234,6 +234,7 @@ OBJS-$(CONFIG_EROSION_OPENCL_FILTER)         += vf_neighbor_opencl.o opencl.o \
>                                                  opencl/neighbor.o
>  OBJS-$(CONFIG_EXTRACTPLANES_FILTER)          += vf_extractplanes.o
>  OBJS-$(CONFIG_FADE_FILTER)                   += vf_fade.o
> +OBJS-$(CONFIG_FASTDEINT_FILTER)              += vf_fastdeint.o
>  OBJS-$(CONFIG_FFTDNOIZ_FILTER)               += vf_fftdnoiz.o
>  OBJS-$(CONFIG_FFTFILT_FILTER)                += vf_fftfilt.o
>  OBJS-$(CONFIG_FIELD_FILTER)                  += vf_field.o
> diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
> index b58daa3a3f..2b0ad92893 100644
> --- a/libavfilter/aarch64/Makefile
> +++ b/libavfilter/aarch64/Makefile
> @@ -1,3 +1,4 @@
>  OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/vf_nlmeans_init.o
>  
> +NEON-OBJS-$(CONFIG_FASTDEINT_FILTER)         += aarch64/merge_neon.o
>  NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/vf_nlmeans_neon.o
> diff --git a/libavfilter/aarch64/merge_neon.S b/libavfilter/aarch64/merge_neon.S
> new file mode 100644
> index 0000000000..62377331a4
> --- /dev/null
> +++ b/libavfilter/aarch64/merge_neon.S
> @@ -0,0 +1,98 @@
> +/*
> + * Copyright (c) 2009-2016 Rémi Denis-Courmont, Janne Grunau, VLC authors
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +#define dest x0
> +#define src1 x1
> +#define src2 x2
> +#define size x3
> +
> +        .align 2
> +        // NOTE: Offset and pitch must be multiple of 16-bytes.
> +function ff_merge8_neon, export=1
> +        ands            x5, size, #~63
> +        b.eq            2f
> +        mov             x10, #64
> +        add             x11, src1, #32
> +        add             x12, src2, #32
> +1:
> +        ld1             {v0.16b,v1.16b}, [src1], x10
> +        ld1             {v4.16b,v5.16b}, [src2], x10
> +        ld1             {v2.16b,v3.16b}, [x11], x10
> +        uhadd           v0.16b, v0.16b, v4.16b
> +        ld1             {v6.16b,v7.16b}, [x12], x10
> +        subs            x5, x5, #64
> +        uhadd           v1.16b, v1.16b, v5.16b
> +        uhadd           v2.16b, v2.16b, v6.16b
> +        uhadd           v3.16b, v3.16b, v7.16b
> +        st1             {v0.16b,v1.16b}, [dest], #32
> +        st1             {v2.16b,v3.16b}, [dest], #32
> +        b.gt            1b
> +2:
> +        tbz             size, #5,  3f
> +        ld1             {v0.16b,v1.16b}, [src1], #32
> +        ld1             {v4.16b,v5.16b}, [src2], #32
> +        uhadd           v0.16b, v0.16b, v4.16b
> +        uhadd           v1.16b, v1.16b, v5.16b
> +        st1             {v0.16b,v1.16b}, [dest], #32
> +3:
> +        tbz             size, #4, 4f
> +        ld1             {v0.16b}, [src1]
> +        ld1             {v4.16b}, [src2]
> +        uhadd           v0.16b, v0.16b, v4.16b
> +        st1             {v0.16b}, [dest]
> +4:
> +        ret
> +endfunc
> +
> +        .align 2
> +function ff_merge16_neon, export=1
> +        ands            x5, size, #~63
> +        b.eq            2f
> +1:
> +        ld1             {v0.8h,v1.8h}, [src1], #32
> +        ld1             {v4.8h,v5.8h}, [src2], #32
> +        ld1             {v2.8h,v3.8h}, [src1], #32
> +        uhadd           v0.8h, v0.8h, v4.8h
> +        ld1             {v6.8h,v7.8h}, [src2], #32
> +        uhadd           v1.8h, v1.8h, v5.8h
> +        uhadd           v2.8h, v2.8h, v6.8h
> +        uhadd           v3.8h, v3.8h, v7.8h
> +        st1             {v0.8h,v1.8h}, [dest], #32
> +        st1             {v2.8h,v3.8h}, [dest], #32
> +        subs            x5, x5, #64
> +        b.gt            1b
> +2:
> +        tbz             size, #5, 3f
> +        ld1             {v0.8h,v1.8h}, [src1], #32
> +        ld1             {v4.8h,v5.8h}, [src2], #32
> +        uhadd           v0.8h, v0.8h, v4.8h
> +        uhadd           v1.8h, v1.8h, v5.8h
> +        st1             {v0.8h,v1.8h}, [dest], #32
> +3:
> +        tbz             size, #4,  4f
> +        ld1             {v0.8h}, [src1]
> +        ld1             {v4.8h}, [src2]
> +        uhadd           v0.8h, v0.8h,v4.8h
> +        st1             {v0.8h}, [dest]
> +4:
> +        ret
> +endfunc
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index b675c688ee..6631af2ffe 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -219,6 +219,7 @@ extern AVFilter ff_vf_erosion;
>  extern AVFilter ff_vf_erosion_opencl;
>  extern AVFilter ff_vf_extractplanes;
>  extern AVFilter ff_vf_fade;
> +extern AVFilter ff_vf_fastdeint;
>  extern AVFilter ff_vf_fftdnoiz;
>  extern AVFilter ff_vf_fftfilt;
>  extern AVFilter ff_vf_field;
> diff --git a/libavfilter/arm/Makefile b/libavfilter/arm/Makefile
> new file mode 100644
> index 0000000000..c92d62fac9
> --- /dev/null
> +++ b/libavfilter/arm/Makefile
> @@ -0,0 +1,3 @@
> +ARMV6-OBJS-$(CONFIG_FASTDEINT_FILTER)  += arm/merge_armv6.o
> +
> +NEON-OBJS-$(CONFIG_FASTDEINT_FILTER)   += arm/merge_neon.o
> diff --git a/libavfilter/arm/merge_armv6.S b/libavfilter/arm/merge_armv6.S
> new file mode 100644
> index 0000000000..9b551c2c6c
> --- /dev/null
> +++ b/libavfilter/arm/merge_armv6.S
> @@ -0,0 +1,70 @@
> +/*
> + * Copyright (c) 2009-2012 Rémi Denis-Courmont, VLC authors
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/arm/asm.S"
> +
> +#define dest r0
> +#define src1 r1
> +#define src2 r2
> +#define size r3
> +
> +        .align 2
> +function ff_merge8_armv6, export=1
> +        push            {r4-r9,lr}
> +1:
> +        pld             [src1, #64]
> +        ldm             src1!, {r4-r5}
> +        pld             [src2, #64]
> +        ldm             src2!, {r8-r9}
> +        subs            size, size, #16
> +        uhadd8          r4, r4, r8
> +        ldm             src1!, {r6-r7}
> +        uhadd8          r5, r5, r9
> +        ldm             src2!, {ip,lr}
> +        uhadd8          r6, r6, ip
> +        stm             dest!, {r4-r5}
> +        uhadd8          r7, r7, lr
> +        stm             dest!, {r6-r7}
> +        it              eq
> +        popeq           {r4-r9,pc}
> +        b               1b
> +endfunc
> +
> +        .align 2
> +function ff_merge16_armv6, export=1
> +        push            {r4-r9,lr}
> +1:
> +        pld             [src1, #64]
> +        ldm             src1!, {r4-r5}
> +        pld             [src2, #64]
> +        ldm             src2!, {r8-r9}
> +        subs            size, size, #16
> +        uhadd16         r4, r4, r8
> +        ldm             src1!, {r6-r7}
> +        uhadd16         r5, r5, r9
> +        ldm             src2!, {ip,lr}
> +        uhadd16         r6, r6, ip
> +        stm             dest!, {r4-r5}
> +        uhadd16         r7, r7, lr
> +        stm             dest!, {r6-r7}
> +        it              eq
> +        popeq           {r4-r9,pc}
> +        b               1b
> +endfunc
> \ No newline at end of file

This shouldn't happen.

> diff --git a/libavfilter/arm/merge_neon.S b/libavfilter/arm/merge_neon.S
> new file mode 100644
> index 0000000000..ae36cf3ca9
> --- /dev/null
> +++ b/libavfilter/arm/merge_neon.S
> @@ -0,0 +1,109 @@
> +/*
> + * Copyright (c) 2009-2012 Rémi Denis-Courmont, VLC authors
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/arm/asm.S"
> +
> +#define dest r0
> +#define src1 r1
> +#define src2 r2
> +#define size r3
> +
> +        .align 2
> +        @ NOTE: Offset and pitch must be multiple of 16-bytes.
> +function ff_merge8_neon, export=1
> +        cmp             size, #64
> +        blo             2f
> +1:
> +        pld             [src1, #64]
> +        vld1.u8         {q0-q1}, [src1,:128]!
> +        pld             [src2, #64]
> +        vld1.u8         {q8-q9}, [src2,:128]!
> +        vhadd.u8        q0, q0, q8
> +        sub             size, size, #64
> +        vld1.u8         {q2-q3}, [src1,:128]!
> +        vhadd.u8        q1, q1, q9
> +        vld1.u8         {q10-q11}, [src2,:128]!
> +        vhadd.u8        q2, q2, q10
> +        cmp             size, #64
> +        vhadd.u8        q3, q3, q11
> +        vst1.u8         {q0-q1}, [dest,:128]!
> +        vst1.u8         {q2-q3}, [dest,:128]!
> +        bhs             1b
> +2:
> +        cmp             size, #32
> +        blo             3f
> +        vld1.u8         {q0-q1}, [src1,:128]!
> +        sub             size, size, #32
> +        vld1.u8         {q8-q9}, [src2,:128]!
> +        vhadd.u8        q0, q0, q8
> +        vhadd.u8        q1, q1, q9
> +        vst1.u8         {q0-q1}, [dest,:128]!
> +3:
> +        cmp             size, #16
> +        it              lo
> +        bxlo            lr
> +        vld1.u8         {q0}, [src1,:128]!
> +        sub             size, size, #16
> +        vld1.u8         {q8}, [src2,:128]!
> +        vhadd.u8        q0, q0, q8
> +        vst1.u8         {q0}, [dest,:128]!
> +        bx              lr
> +endfunc
> +
> +        .align 2
> +function ff_merge16_neon, export=1
> +        cmp             size, #64
> +        blo             2f
> +1:
> +        pld             [src1, #64]
> +        vld1.u16        {q0-q1}, [src1,:128]!
> +        pld             [src2, #64]
> +        vld1.u16        {q8-q9}, [src2,:128]!
> +        vhadd.u16       q0, q0, q8
> +        sub             size, size, #64
> +        vld1.u16        {q2-q3}, [src1,:128]!
> +        vhadd.u16       q1, q1, q9
> +        vld1.u16        {q10-q11}, [src2,:128]!
> +        vhadd.u16       q2, q2, q10
> +        cmp             size, #64
> +        vhadd.u16       q3, q3, q11
> +        vst1.u16        {q0-q1}, [dest,:128]!
> +        vst1.u16        {q2-q3}, [dest,:128]!
> +        bhs             1b
> +2:
> +        cmp             size, #32
> +        blo             3f
> +        vld1.u16        {q0-q1}, [src1,:128]!
> +        sub             size, size, #32
> +        vld1.u16        {q8-q9}, [src2,:128]!
> +        vhadd.u16       q0, q0, q8
> +        vhadd.u16       q1, q1, q9
> +        vst1.u16        {q0-q1}, [dest,:128]!
> +3:
> +        cmp             size, #16
> +        it              lo
> +        bxlo            lr
> +        vld1.u16        {q0}, [src1,:128]!
> +        sub             size, size, #16
> +        vld1.u16        {q8}, [src2,:128]!
> +        vhadd.u16       q0, q0, q8
> +        vst1.u16        {q0}, [dest,:128]!
> +        bx              lr
> +endfunc
> \ No newline at end of file
> diff --git a/libavfilter/vf_fastdeint.c b/libavfilter/vf_fastdeint.c
> new file mode 100644
> index 0000000000..5ddd8be392
> --- /dev/null
> +++ b/libavfilter/vf_fastdeint.c
> @@ -0,0 +1,588 @@
> +/*
> + * Copyright (C) 2015 Aman Gupta <aman@tmm1.net>
> + *               2000-2011 VLC authors and VideoLAN
> + *
> + * Author: Sam Hocevar <sam@zoy.org>
> + *         Damien Lucas <nitrox@videolan.org>
> + *         Laurent Aimar <fenrir@videolan.org>
> + *         Sigmund Augdal Helberg <sigmunau@videolan.org>
> + *
> + * These algorithms are derived from the VLC project's
> + * modules/video_filter/deinterlace/algo_basic.c
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/avassert.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/common.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/pixdesc.h"
> +#include "libavutil/imgutils.h"
> +#include "libavutil/timestamp.h"
> +#include "avfilter.h"
> +#include "formats.h"
> +#include "internal.h"
> +#include "video.h"
> +
> +enum Mode {
> +  MODE_DISCARD,
> +  MODE_MEAN,
> +  MODE_BLEND,
> +  MODE_BOB,
> +  MODE_LINEAR,
> +  MODE_MAX,
> +};
> +
> +typedef void (*merge_fn)(void *dst, const void *src1, const void *src2, size_t len);
> +
> +typedef struct FastDeintContext {
> +    const AVClass *class;
> +    merge_fn merge;
> +    int merge_size;
> +    int merge_aligned;
> +    AVFrame *cur, *next;
> +    enum Mode mode;
> +    int eof;
> +} FastDeintContext;
> +
> +static void merge8_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, size_t bytes)
> +{
> +    for (; bytes > 0; bytes--)
> +        *dst++ = ( *src1++ + *src2++ ) >> 1;
> +}
> +
> +static void merge16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, size_t bytes)
> +{
> +    for (size_t words = bytes / 2; words > 0; words--)
> +        *dst++ = ( *src1++ + *src2++ ) >> 1;
> +}
> +
> +static void merge8_unaligned(FastDeintContext *s, uint8_t *dst, const uint8_t *src1, const uint8_t *src2, size_t bytes)
> +{
> +    if (s->merge_aligned) {
> +        size_t remainder = bytes % 16;
> +        if (remainder > 0) {
> +            merge8_c(dst, src1, src2, remainder);
> +            bytes -= remainder;
> +            dst += remainder;
> +            src1 += remainder;
> +            src2 += remainder;
> +        }
> +    }
> +    s->merge(dst, src1, src2, bytes);
> +}
> +
> +static void merge16_unaligned(FastDeintContext *s, uint16_t *dst, const uint16_t *src1, const uint16_t *src2, size_t bytes)
> +{
> +    if (s->merge_aligned) {
> +        size_t words = bytes / 2;
> +        size_t remainder = words % 8;
> +        if (remainder > 0) {
> +            merge16_c(dst, src1, src2, remainder);
> +            words -= remainder;
> +            dst += remainder;
> +            src1 += remainder;
> +            src2 += remainder;
> +        }
> +    }
> +    s->merge(dst, src1, src2, bytes);
> +}
> +
> +static void merge_unaligned(FastDeintContext *s, void *dst, const void *src1, const void *src2, size_t bytes)
> +{
> +    if (s->merge_size == 16)
> +        merge16_unaligned(s, dst, src1, src2, bytes);
> +    else
> +        merge8_unaligned(s, dst, src1, src2, bytes);
> +}
> +
> +#if HAVE_SSE2_INLINE && defined(__x86_64__)

No inline asm. This code needs to be ported to nasm syntax.

Also, no arch specific code should be present in arch agnostic source
files, beyond calls to init() functions.

> +static void merge8_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, size_t bytes)
> +{
> +    for(; bytes > 0 && ((uintptr_t)src1 & 15); bytes--)
> +        *dst++ = ( *src1++ + *src2++ ) >> 1;
> +
> +    for (; bytes >= 16; bytes -= 16) {
> +        __asm__  __volatile__( "movdqu %2,%%xmm1;"
> +                               "pavgb %1, %%xmm1;"
> +                               "movdqu %%xmm1, %0" :"=m" (*dst):
> +                                                 "m" (*src1),
> +                                                 "m" (*src2) : "xmm1" );
> +        dst += 16;
> +        src1 += 16;
> +        src2 += 16;
> +    }
> +
> +    if (bytes > 0) {
> +        merge8_c(dst, src1, src2, bytes);
> +    }
> +}
> +static void merge16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, size_t bytes)
> +{
> +    size_t words = bytes / 2;
> +
> +    for(; words > 0 && ((uintptr_t)src1 & 15); words--)
> +        *dst++ = ( *src1++ + *src2++ ) >> 1;
> +
> +    for (; words >= 8; words -= 8) {
> +        __asm__  __volatile__( "movdqu %2,%%xmm1;"
> +                               "pavgw %1, %%xmm1;"
> +                               "movdqu %%xmm1, %0" :"=m" (*dst):
> +                                                 "m" (*src1),
> +                                                 "m" (*src2) : "xmm1" );
> +        dst += 8;
> +        src1 += 8;
> +        src2 += 8;
> +    }
> +
> +    if (words > 0) {
> +        merge16_c(dst, src1, src2, words * 2);
> +    }
> +}
> +#define merge8 merge8_sse2
> +#define merge16 merge16_sse2
> +#else
> +#define merge8 merge8_c
> +#define merge16 merge16_c
> +#endif
> +
> +static void render_image_single(FastDeintContext *s, AVFrame *out, AVFrame *frame)
> +{
> +    int i, planes_nb = 0;
> +    enum Mode mode = s->mode;
> +    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(out->format);
> +
> +    for (i = 0; i < desc->nb_components; i++)
> +        planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
> +
> +    for (i = 0; i < planes_nb; i++) {
> +        int height, bwidth;
> +        int dst_linesize, src_linesize;
> +        const uint8_t *src;
> +        uint8_t *dst;
> +
> +        bwidth = av_image_get_linesize(out->format, out->width, i);
> +        if (bwidth < 0) {
> +            av_log(s, AV_LOG_ERROR, "av_image_get_linesize failed\n");
> +            return;
> +        }
> +
> +        height = out->height;
> +        if (i == 1 || i == 2) {
> +            height = FF_CEIL_RSHIFT(out->height, desc->log2_chroma_h);
> +        }
> +
> +        src = frame->data[i];
> +        dst = out->data[i];
> +        dst_linesize = out->linesize[i];
> +        src_linesize = frame->linesize[i];
> +
> +        if (mode == MODE_BLEND) {
> +            // Copy first line
> +            memcpy(dst, src, bwidth);
> +            dst += dst_linesize;
> +            height--;
> +        }
> +
> +        // Merge remaining lines
> +        for (; height > 0; height--) {
> +            if (mode == MODE_DISCARD)
> +                memcpy(dst, src, bwidth);
> +            else
> +                merge_unaligned(s, dst, src, src + src_linesize, bwidth);
> +            dst += dst_linesize;
> +            src += src_linesize;
> +            if (mode == MODE_MEAN || mode == MODE_DISCARD) {
> +                src += src_linesize;
> +                height--;
> +            }
> +        }
> +    }
> +    if (mode != MODE_DISCARD)
> +        emms_c();
> +}
> +
> +static void render_image_doubler(FastDeintContext *s, AVFrame *out, AVFrame *frame, int field)
> +{
> +    int i, planes_nb = 0;
> +    enum Mode mode = s->mode;
> +    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(out->format);
> +
> +    for (i = 0; i < desc->nb_components; i++)
> +        planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
> +
> +    for (i = 0; i < planes_nb; i++) {
> +        int height, bwidth;
> +        int dst_linesize, src_linesize;
> +        const uint8_t *src;
> +        uint8_t *dst;
> +
> +        bwidth = av_image_get_linesize(out->format, out->width, i);
> +        if (bwidth < 0) {
> +            av_log(s, AV_LOG_ERROR, "av_image_get_linesize failed\n");
> +            return;
> +        }
> +        height = out->height;
> +        if (i == 1 || i == 2) {
> +            height = FF_CEIL_RSHIFT(out->height, desc->log2_chroma_h);
> +        }
> +
> +        src = frame->data[i];
> +        dst = out->data[i];
> +        src_linesize = frame->linesize[i];
> +        dst_linesize = out->linesize[i];
> +
> +        // For BOTTOM field we need to add the first line
> +        if (field == 1) {
> +            memcpy(dst, src, bwidth);
> +            dst += dst_linesize;
> +            src += src_linesize;
> +            height--;
> +        }
> +
> +        height -= 2;
> +
> +        for (; height > 0; height-=2) {
> +            memcpy(dst, src, bwidth);
> +            dst += dst_linesize;
> +
> +            if (mode == MODE_LINEAR)
> +                merge_unaligned(s, dst, src, src + 2 * src_linesize, bwidth);
> +            else
> +                memcpy(dst, src, bwidth);
> +            dst += dst_linesize;
> +
> +            src += src_linesize * 2;
> +        }
> +
> +        memcpy(dst, src, bwidth);
> +
> +        // For TOP field we need to add the last line
> +        if (field == 0)
> +        {
> +            dst += dst_linesize;
> +            src += src_linesize;
> +            memcpy(dst, src, bwidth);
> +        }
> +    }
> +    if (mode == MODE_LINEAR)
> +        emms_c();
> +}
> +
> +static int filter_frame_single(AVFilterLink *link, AVFrame *frame)
> +{
> +    AVFilterContext *ctx = link->dst;
> +    AVFrame *out;
> +    FastDeintContext *s = ctx->priv;
> +
> +    if (!frame->interlaced_frame) {
> +        return ff_filter_frame(ctx->outputs[0], frame);
> +    }
> +
> +    out = ff_get_video_buffer(ctx->outputs[0], link->w, link->h);
> +    if (!out) {
> +        av_frame_free(&frame);
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    av_frame_copy_props(out, frame);
> +    out->interlaced_frame = 0;
> +    render_image_single(s, out, frame);
> +
> +    av_frame_free(&frame);
> +    return ff_filter_frame(ctx->outputs[0], out);
> +}
> +
> +static AVFrame *copy_frame(AVFilterLink *link, AVFrame *frame)
> +{
> +    AVFilterContext *ctx = link->dst;
> +    AVFrame *out;
> +
> +    if (frame->format == AV_PIX_FMT_VIDEOTOOLBOX)
> +        out = av_frame_alloc();
> +    else
> +        out = ff_get_video_buffer(ctx->outputs[0], link->w, link->h);
> +
> +    if (!out)
> +        return NULL;
> +
> +    av_frame_copy_props(out, frame);
> +    return out;
> +}
> +
> +static int filter_frame_double(AVFilterLink *link, AVFrame *in)
> +{
> +    AVFilterContext *ctx = link->dst;
> +    FastDeintContext *s = ctx->priv;
> +    AVFrame *frame, *out, *out2;
> +    int tff, ret;
> +
> +    s->cur = s->next;
> +    s->next = in;
> +
> +    if (!s->cur) {
> +        return 0;
> +    }
> +
> +    frame = s->cur;
> +
> +    if (!frame->interlaced_frame) {
> +        if (frame->pts != AV_NOPTS_VALUE)
> +            frame->pts *= 2;
> +        s->cur = NULL;
> +        return ff_filter_frame(ctx->outputs[0], frame);
> +    }
> +
> +    tff = frame->top_field_first;
> +    out = copy_frame(link, frame);
> +    if (!out) {
> +        av_frame_free(&frame);
> +        s->cur = NULL;
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    out->interlaced_frame = 0;
> +    if (out->pts != AV_NOPTS_VALUE)
> +        out->pts = out->pts * 2;
> +    render_image_doubler(s, out, frame, !tff);
> +
> +    ret = ff_filter_frame(ctx->outputs[0], out);
> +    if (ret < 0) {
> +        av_frame_free(&frame);
> +        s->cur = NULL;
> +        return ret;
> +    }
> +
> +    out2 = copy_frame(link, frame);
> +    if (!out2) {
> +        av_frame_free(&frame);
> +        s->cur = NULL;
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    out2->interlaced_frame = 0;
> +    av_frame_remove_side_data(out2, AV_FRAME_DATA_A53_CC);
> +    if (out2->pts != AV_NOPTS_VALUE) {
> +        out2->pts = frame->pts + s->next->pts;
> +    }
> +    render_image_doubler(s, out2, frame, tff);
> +
> +    av_frame_free(&frame);
> +    s->cur = NULL;
> +
> +    return ff_filter_frame(ctx->outputs[0], out2);
> +}
> +
> +static int filter_frame(AVFilterLink *link, AVFrame *frame)
> +{
> +    AVFilterContext *ctx = link->dst;
> +    FastDeintContext *s = ctx->priv;
> +
> +    av_assert0(frame);
> +
> +    if (s->mode == MODE_LINEAR || s->mode == MODE_BOB) {
> +        return filter_frame_double(link, frame);
> +    } else {
> +        return filter_frame_single(link, frame);
> +    }
> +}
> +
> +static av_cold void uninit(AVFilterContext *ctx)
> +{
> +    FastDeintContext *s = ctx->priv;
> +    av_frame_free(&s->cur);
> +    av_frame_free(&s->next);
> +}
> +
> +static int query_formats(AVFilterContext *ctx)
> +{
> +    static const enum AVPixelFormat pix_fmts[] = {
> +        AV_PIX_FMT_YUV420P,
> +        AV_PIX_FMT_YUV422P,
> +        AV_PIX_FMT_YUV444P,
> +        AV_PIX_FMT_YUV410P,
> +        AV_PIX_FMT_YUV411P,
> +        AV_PIX_FMT_GRAY8,
> +        AV_PIX_FMT_YUVJ420P,
> +        AV_PIX_FMT_YUVJ422P,
> +        AV_PIX_FMT_YUVJ444P,
> +        AV_PIX_FMT_GRAY16,
> +        AV_PIX_FMT_YUV440P,
> +        AV_PIX_FMT_YUVJ440P,
> +        AV_PIX_FMT_YUV420P9,
> +        AV_PIX_FMT_YUV422P9,
> +        AV_PIX_FMT_YUV444P9,
> +        AV_PIX_FMT_YUV420P10,
> +        AV_PIX_FMT_YUV422P10,
> +        AV_PIX_FMT_YUV444P10,
> +        AV_PIX_FMT_YUV420P12,
> +        AV_PIX_FMT_YUV422P12,
> +        AV_PIX_FMT_YUV444P12,
> +        AV_PIX_FMT_YUV420P14,
> +        AV_PIX_FMT_YUV422P14,
> +        AV_PIX_FMT_YUV444P14,
> +        AV_PIX_FMT_YUV420P16,
> +        AV_PIX_FMT_YUV422P16,
> +        AV_PIX_FMT_YUV444P16,
> +        AV_PIX_FMT_YUVA420P,
> +        AV_PIX_FMT_YUVA422P,
> +        AV_PIX_FMT_YUVA444P,
> +        AV_PIX_FMT_GBRP,
> +        AV_PIX_FMT_GBRP9,
> +        AV_PIX_FMT_GBRP10,
> +        AV_PIX_FMT_GBRP12,
> +        AV_PIX_FMT_GBRP14,
> +        AV_PIX_FMT_GBRP16,
> +        AV_PIX_FMT_GBRAP,
> +        AV_PIX_FMT_NONE
> +    };
> +
> +    AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
> +    if (!fmts_list)
> +        return AVERROR(ENOMEM);
> +    return ff_set_common_formats(ctx, fmts_list);
> +}
> +
> +#if ARCH_ARM
> +#include "libavutil/arm/cpu.h"
> +#endif
> +#if ARCH_AARCH64
> +#include "libavutil/aarch64/cpu.h"
> +#endif
> +#if ARCH_AARCH64 || ARCH_ARM
> +void ff_merge8_neon(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, size_t bytes);
> +void ff_merge16_neon(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, size_t bytes);
> +void ff_merge8_armv6(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, size_t bytes);
> +void ff_merge16_armv6(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, size_t bytes);
> +#endif
> +
> +static int config_props(AVFilterLink *link)
> +{
> +    AVFilterContext *ctx = link->src;
> +    FastDeintContext *s = ctx->priv;
> +    const AVPixFmtDescriptor *pix;
> +#if ARCH_AARCH64 || ARCH_ARM
> +    int cpu_flags = av_get_cpu_flags();
> +#endif
> +
> +    link->w = link->src->inputs[0]->w;
> +    link->h = link->src->inputs[0]->h;
> +    link->time_base  = link->src->inputs[0]->time_base;
> +    link->frame_rate = link->src->inputs[0]->frame_rate;
> +    link->sample_aspect_ratio = link->src->inputs[0]->sample_aspect_ratio;
> +
> +    if (s->mode == MODE_MEAN || s->mode == MODE_DISCARD) {
> +        link->h /= 2;
> +        link->sample_aspect_ratio = av_mul_q(link->sample_aspect_ratio, av_make_q(1, 2));
> +    }
> +    if (s->mode == MODE_LINEAR || s->mode == MODE_BOB) {
> +        link->time_base  = av_mul_q(link->time_base,  av_make_q(1, 2));
> +        link->frame_rate = av_mul_q(link->frame_rate, av_make_q(2, 1));
> +    }
> +
> +    pix = av_pix_fmt_desc_get(link->format);
> +    s->merge_size = (pix->comp[0].depth > 8) ? 16 : 8;
> +    s->merge = s->merge_size == 16 ? (merge_fn)merge16 : (merge_fn)merge8;
> +
> +#if ARCH_ARM
> +    if (have_armv6(cpu_flags)) {
> +        s->merge = s->merge_size == 16 ? (merge_fn)ff_merge16_armv6 : (merge_fn)ff_merge8_armv6;
> +        s->merge_aligned = 1;
> +    }
> +#endif
> +#if ARCH_AARCH64 || ARCH_ARM
> +    if (have_neon(cpu_flags)) {
> +        s->merge = s->merge_size == 16 ? (merge_fn)ff_merge16_neon : (merge_fn)ff_merge8_neon;
> +        s->merge_aligned = 1;
> +    }
> +#endif


As i mentioned above, this kind of initialization and any function
prototypes should be added to init files in the respective folders.

In here you should only call init() functions which will set the above.
See how other filters do it, like tinterlace.

> +
> +    return 0;
> +}
> +
> +static int request_frame(AVFilterLink *link)
> +{
> +    AVFilterContext *ctx = link->src;
> +    FastDeintContext *s = ctx->priv;
> +    int ret;
> +
> +    if (s->eof)
> +        return AVERROR_EOF;
> +
> +    ret = ff_request_frame(ctx->inputs[0]);
> +
> +    if (ret == AVERROR_EOF && s->cur) {
> +        AVFrame *next = av_frame_clone(s->next);
> +        if (!next)
> +            return AVERROR(ENOMEM);
> +
> +        next->pts = s->next->pts * 2 - s->cur->pts;
> +        filter_frame(ctx->inputs[0], next);
> +        s->eof = 1;
> +    } else if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return 0;
> +}
> +
> +#define OFFSET(x) offsetof(FastDeintContext, x)
> +#define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
> +
> +#define CONST(name, help, val, unit) { name, help, 0, AV_OPT_TYPE_CONST, {.i64=val}, INT_MIN, INT_MAX, FLAGS, unit }
> +
> +static const AVOption fastdeint_options[] = {
> +    { "mode", "specify the deinterlacing mode", OFFSET(mode), AV_OPT_TYPE_INT, {.i64=MODE_BLEND}, 0, MODE_MAX-1, FLAGS, "mode" },
> +    CONST("discard", "discard bottom frame", MODE_DISCARD, "mode"),
> +    CONST("mean", "half resolution blender", MODE_MEAN, "mode"),
> +    CONST("blend", "full resolution blender", MODE_BLEND, "mode"),
> +    CONST("bob", "bob doubler", MODE_BOB, "mode"),
> +    CONST("linear", "bob doubler with linear interpolation", MODE_LINEAR, "mode"),
> +
> +    { NULL }
> +};
> +
> +AVFILTER_DEFINE_CLASS(fastdeint);
> +
> +static const AVFilterPad fastdeint_inputs[] = {
> +    {
> +        .name          = "default",
> +        .type          = AVMEDIA_TYPE_VIDEO,
> +        .filter_frame  = filter_frame,
> +    },
> +    { NULL }
> +};
> +
> +static const AVFilterPad fastdeint_outputs[] = {
> +    {
> +        .name          = "default",
> +        .type          = AVMEDIA_TYPE_VIDEO,
> +        .config_props  = config_props,
> +        .request_frame = request_frame
> +    },
> +    { NULL }
> +};
> +
> +AVFilter ff_vf_fastdeint = {
> +    .name          = "fastdeint",
> +    .description   = NULL_IF_CONFIG_SMALL("fast deinterlacing algorithms"),
> +    .priv_size     = sizeof(FastDeintContext),
> +    .priv_class    = &fastdeint_class,
> +    .uninit        = uninit,
> +    .query_formats = query_formats,
> +    .inputs        = fastdeint_inputs,
> +    .outputs       = fastdeint_outputs,
> +};
>
Carl Eugen Hoyos Sept. 9, 2019, 9:41 p.m.
Am Mo., 9. Sept. 2019 um 22:19 Uhr schrieb Aman Gupta <ffmpeg@tmm1.net>:
>
> From: Aman Gupta <aman@tmm1.net>
>
> These are simple algorithms which can be run efficiently
> on low powered devices to produce deinteraced images.

Please provide some numbers about the performance
(and subjective visual quality) of the new C code in
comparison to existing deinterlacers in FFmpeg.

Carl Eugen
Aman Gupta Sept. 9, 2019, 9:54 p.m.
On Mon, Sep 9, 2019 at 2:47 PM Carl Eugen Hoyos <ceffmpeg@gmail.com> wrote:

> Am Mo., 9. Sept. 2019 um 22:19 Uhr schrieb Aman Gupta <ffmpeg@tmm1.net>:
> >
> > From: Aman Gupta <aman@tmm1.net>
> >
> > These are simple algorithms which can be run efficiently
> > on low powered devices to produce deinteraced images.
>
> Please provide some numbers about the performance
> (and subjective visual quality) of the new C code in
> comparison to existing deinterlacers in FFmpeg.
>

Comparison of visual quality can be seen on VLC's website:
https://wiki.videolan.org/Deinterlacing

Regarding performance- none of the filters currently available in ffmpeg
are fast enough to deinterlace video in real time on ARM chips used by
popular Android or iOS devices. They're all very computationally expensive,
and do not have any ARM SIMD implementations. The deinterlacers from VLC
use simple mathematical averages optimized by SIMD, and have been used by
VLC on such devices for many years. I don't have any hard numbers to share,
but in my experience I can decode+deinterlace video for real time playback
in VLC on any cheap Android phone, whereas other ffmpeg-based players
cannot.

Aman


> Carl Eugen
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Hendrik Leppkes Sept. 9, 2019, 10:13 p.m.
On Tue, Sep 10, 2019 at 12:00 AM Aman Gupta <ffmpeg@tmm1.net> wrote:
>
> On Mon, Sep 9, 2019 at 2:47 PM Carl Eugen Hoyos <ceffmpeg@gmail.com> wrote:
>
> > Am Mo., 9. Sept. 2019 um 22:19 Uhr schrieb Aman Gupta <ffmpeg@tmm1.net>:
> > >
> > > From: Aman Gupta <aman@tmm1.net>
> > >
> > > These are simple algorithms which can be run efficiently
> > > on low powered devices to produce deinteraced images.
> >
> > Please provide some numbers about the performance
> > (and subjective visual quality) of the new C code in
> > comparison to existing deinterlacers in FFmpeg.
> >
>
> Comparison of visual quality can be seen on VLC's website:
> https://wiki.videolan.org/Deinterlacing
>
> Regarding performance- none of the filters currently available in ffmpeg
> are fast enough to deinterlace video in real time on ARM chips used by
> popular Android or iOS devices. They're all very computationally expensive,
> and do not have any ARM SIMD implementations. The deinterlacers from VLC
> use simple mathematical averages optimized by SIMD, and have been used by
> VLC on such devices for many years. I don't have any hard numbers to share,
> but in my experience I can decode+deinterlace video for real time playback
> in VLC on any cheap Android phone, whereas other ffmpeg-based players
> cannot.
>

None of those algorithms are really worth using, none are actual
"deinterlacers". Blend and Mean are just plain out terrible, and the
other options are just dumb bob'ers which you can do with avfilter
as-is today with a combination of the separatefields filter (which is
zero-copy based on frame metadata only) and optional scaling
afterwards.

- Hendrik
Aman Gupta Sept. 9, 2019, 10:31 p.m.
On Mon, Sep 9, 2019 at 3:19 PM Hendrik Leppkes <h.leppkes@gmail.com> wrote:

> On Tue, Sep 10, 2019 at 12:00 AM Aman Gupta <ffmpeg@tmm1.net> wrote:
> >
> > On Mon, Sep 9, 2019 at 2:47 PM Carl Eugen Hoyos <ceffmpeg@gmail.com>
> wrote:
> >
> > > Am Mo., 9. Sept. 2019 um 22:19 Uhr schrieb Aman Gupta <ffmpeg@tmm1.net
> >:
> > > >
> > > > From: Aman Gupta <aman@tmm1.net>
> > > >
> > > > These are simple algorithms which can be run efficiently
> > > > on low powered devices to produce deinteraced images.
> > >
> > > Please provide some numbers about the performance
> > > (and subjective visual quality) of the new C code in
> > > comparison to existing deinterlacers in FFmpeg.
> > >
> >
> > Comparison of visual quality can be seen on VLC's website:
> > https://wiki.videolan.org/Deinterlacing
> >
> > Regarding performance- none of the filters currently available in ffmpeg
> > are fast enough to deinterlace video in real time on ARM chips used by
> > popular Android or iOS devices. They're all very computationally
> expensive,
> > and do not have any ARM SIMD implementations. The deinterlacers from VLC
> > use simple mathematical averages optimized by SIMD, and have been used by
> > VLC on such devices for many years. I don't have any hard numbers to
> share,
> > but in my experience I can decode+deinterlace video for real time
> playback
> > in VLC on any cheap Android phone, whereas other ffmpeg-based players
> > cannot.
> >
>
> None of those algorithms are really worth using, none are actual
> "deinterlacers". Blend and Mean are just plain out terrible, and the
> other options are just dumb bob'ers which you can do with avfilter
> as-is today with a combination of the separatefields filter (which is
> zero-copy based on frame metadata only) and optional scaling
> afterwards.
>

I don't disagree that many of them are overly simplistic. I only copied
them all for completeness sake.

However, as terrible as they may be they're not as bad as displaying
interlaced frames directly. Blend and Linear produce acceptable image
quality imho.

Linear averages lines from both fields to generate a new image. Is
something like this possible with any existing filter combined with
separatefields?

Aman


>
> - Hendrik
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Carl Eugen Hoyos Sept. 9, 2019, 10:39 p.m.
Am Di., 10. Sept. 2019 um 00:00 Uhr schrieb Aman Gupta <ffmpeg@tmm1.net>:
>
> On Mon, Sep 9, 2019 at 2:47 PM Carl Eugen Hoyos <ceffmpeg@gmail.com> wrote:
>
> > Am Mo., 9. Sept. 2019 um 22:19 Uhr schrieb Aman Gupta <ffmpeg@tmm1.net>:
> > >
> > > From: Aman Gupta <aman@tmm1.net>
> > >
> > > These are simple algorithms which can be run efficiently
> > > on low powered devices to produce deinteraced images.
> >
> > Please provide some numbers about the performance
> > (and subjective visual quality) of the new C code in
> > comparison to existing deinterlacers in FFmpeg.
> >
>
> Comparison of visual quality can be seen on VLC's website:
> https://wiki.videolan.org/Deinterlacing
>
> Regarding performance- none of the filters currently available in ffmpeg
> are fast enough to deinterlace video in real time on ARM chips used by
> popular Android or iOS devices.

That was not my question and I believe the commit message
absolutely needs some hints about the performance and the
quality.

Carl Eugen

Patch hide | download patch | download mbox

diff --git a/doc/filters.texi b/doc/filters.texi
index 6c81e1da40..55d9adeb81 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -9796,6 +9796,33 @@  fade=t=in:st=5.5:d=0.5
 
 @end itemize
 
+@section fastdeint
+Fast deinterlacing algorithms.
+
+@table @option
+@item mode
+Deinterlacing algorithm to use.
+
+It accepts the following values:
+@table @samp
+@item discard
+Discard bottom frame.
+
+@item mean
+Half resolution blender.
+
+@item blend
+Full resolution blender.
+
+@item bob
+Bob doubler.
+
+@item linear
+Bob doubler with linear interpolation.
+@end table
+
+@end table
+
 @section fftdnoiz
 Denoise frames using 3D FFT (frequency domain filtering).
 
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 3ef4191d9a..a2b3566ec0 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -234,6 +234,7 @@  OBJS-$(CONFIG_EROSION_OPENCL_FILTER)         += vf_neighbor_opencl.o opencl.o \
                                                 opencl/neighbor.o
 OBJS-$(CONFIG_EXTRACTPLANES_FILTER)          += vf_extractplanes.o
 OBJS-$(CONFIG_FADE_FILTER)                   += vf_fade.o
+OBJS-$(CONFIG_FASTDEINT_FILTER)              += vf_fastdeint.o
 OBJS-$(CONFIG_FFTDNOIZ_FILTER)               += vf_fftdnoiz.o
 OBJS-$(CONFIG_FFTFILT_FILTER)                += vf_fftfilt.o
 OBJS-$(CONFIG_FIELD_FILTER)                  += vf_field.o
diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
index b58daa3a3f..2b0ad92893 100644
--- a/libavfilter/aarch64/Makefile
+++ b/libavfilter/aarch64/Makefile
@@ -1,3 +1,4 @@ 
 OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/vf_nlmeans_init.o
 
+NEON-OBJS-$(CONFIG_FASTDEINT_FILTER)         += aarch64/merge_neon.o
 NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/vf_nlmeans_neon.o
diff --git a/libavfilter/aarch64/merge_neon.S b/libavfilter/aarch64/merge_neon.S
new file mode 100644
index 0000000000..62377331a4
--- /dev/null
+++ b/libavfilter/aarch64/merge_neon.S
@@ -0,0 +1,98 @@ 
+/*
+ * Copyright (c) 2009-2016 Rémi Denis-Courmont, Janne Grunau, VLC authors
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define dest x0
+#define src1 x1
+#define src2 x2
+#define size x3
+
+        .align 2
+        // NOTE: Offset and pitch must be multiple of 16-bytes.
+function ff_merge8_neon, export=1
+        ands            x5, size, #~63
+        b.eq            2f
+        mov             x10, #64
+        add             x11, src1, #32
+        add             x12, src2, #32
+1:
+        ld1             {v0.16b,v1.16b}, [src1], x10
+        ld1             {v4.16b,v5.16b}, [src2], x10
+        ld1             {v2.16b,v3.16b}, [x11], x10
+        uhadd           v0.16b, v0.16b, v4.16b
+        ld1             {v6.16b,v7.16b}, [x12], x10
+        subs            x5, x5, #64
+        uhadd           v1.16b, v1.16b, v5.16b
+        uhadd           v2.16b, v2.16b, v6.16b
+        uhadd           v3.16b, v3.16b, v7.16b
+        st1             {v0.16b,v1.16b}, [dest], #32
+        st1             {v2.16b,v3.16b}, [dest], #32
+        b.gt            1b
+2:
+        tbz             size, #5,  3f
+        ld1             {v0.16b,v1.16b}, [src1], #32
+        ld1             {v4.16b,v5.16b}, [src2], #32
+        uhadd           v0.16b, v0.16b, v4.16b
+        uhadd           v1.16b, v1.16b, v5.16b
+        st1             {v0.16b,v1.16b}, [dest], #32
+3:
+        tbz             size, #4, 4f
+        ld1             {v0.16b}, [src1]
+        ld1             {v4.16b}, [src2]
+        uhadd           v0.16b, v0.16b, v4.16b
+        st1             {v0.16b}, [dest]
+4:
+        ret
+endfunc
+
+        .align 2
+function ff_merge16_neon, export=1
+        ands            x5, size, #~63
+        b.eq            2f
+1:
+        ld1             {v0.8h,v1.8h}, [src1], #32
+        ld1             {v4.8h,v5.8h}, [src2], #32
+        ld1             {v2.8h,v3.8h}, [src1], #32
+        uhadd           v0.8h, v0.8h, v4.8h
+        ld1             {v6.8h,v7.8h}, [src2], #32
+        uhadd           v1.8h, v1.8h, v5.8h
+        uhadd           v2.8h, v2.8h, v6.8h
+        uhadd           v3.8h, v3.8h, v7.8h
+        st1             {v0.8h,v1.8h}, [dest], #32
+        st1             {v2.8h,v3.8h}, [dest], #32
+        subs            x5, x5, #64
+        b.gt            1b
+2:
+        tbz             size, #5, 3f
+        ld1             {v0.8h,v1.8h}, [src1], #32
+        ld1             {v4.8h,v5.8h}, [src2], #32
+        uhadd           v0.8h, v0.8h, v4.8h
+        uhadd           v1.8h, v1.8h, v5.8h
+        st1             {v0.8h,v1.8h}, [dest], #32
+3:
+        tbz             size, #4,  4f
+        ld1             {v0.8h}, [src1]
+        ld1             {v4.8h}, [src2]
+        uhadd           v0.8h, v0.8h,v4.8h
+        st1             {v0.8h}, [dest]
+4:
+        ret
+endfunc
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index b675c688ee..6631af2ffe 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -219,6 +219,7 @@  extern AVFilter ff_vf_erosion;
 extern AVFilter ff_vf_erosion_opencl;
 extern AVFilter ff_vf_extractplanes;
 extern AVFilter ff_vf_fade;
+extern AVFilter ff_vf_fastdeint;
 extern AVFilter ff_vf_fftdnoiz;
 extern AVFilter ff_vf_fftfilt;
 extern AVFilter ff_vf_field;
diff --git a/libavfilter/arm/Makefile b/libavfilter/arm/Makefile
new file mode 100644
index 0000000000..c92d62fac9
--- /dev/null
+++ b/libavfilter/arm/Makefile
@@ -0,0 +1,3 @@ 
+ARMV6-OBJS-$(CONFIG_FASTDEINT_FILTER)  += arm/merge_armv6.o
+
+NEON-OBJS-$(CONFIG_FASTDEINT_FILTER)   += arm/merge_neon.o
diff --git a/libavfilter/arm/merge_armv6.S b/libavfilter/arm/merge_armv6.S
new file mode 100644
index 0000000000..9b551c2c6c
--- /dev/null
+++ b/libavfilter/arm/merge_armv6.S
@@ -0,0 +1,70 @@ 
+/*
+ * Copyright (c) 2009-2012 Rémi Denis-Courmont, VLC authors
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+#define dest r0
+#define src1 r1
+#define src2 r2
+#define size r3
+
+        .align 2
+function ff_merge8_armv6, export=1
+        push            {r4-r9,lr}
+1:
+        pld             [src1, #64]
+        ldm             src1!, {r4-r5}
+        pld             [src2, #64]
+        ldm             src2!, {r8-r9}
+        subs            size, size, #16
+        uhadd8          r4, r4, r8
+        ldm             src1!, {r6-r7}
+        uhadd8          r5, r5, r9
+        ldm             src2!, {ip,lr}
+        uhadd8          r6, r6, ip
+        stm             dest!, {r4-r5}
+        uhadd8          r7, r7, lr
+        stm             dest!, {r6-r7}
+        it              eq
+        popeq           {r4-r9,pc}
+        b               1b
+endfunc
+
+        .align 2
+function ff_merge16_armv6, export=1
+        push            {r4-r9,lr}
+1:
+        pld             [src1, #64]
+        ldm             src1!, {r4-r5}
+        pld             [src2, #64]
+        ldm             src2!, {r8-r9}
+        subs            size, size, #16
+        uhadd16         r4, r4, r8
+        ldm             src1!, {r6-r7}
+        uhadd16         r5, r5, r9
+        ldm             src2!, {ip,lr}
+        uhadd16         r6, r6, ip
+        stm             dest!, {r4-r5}
+        uhadd16         r7, r7, lr
+        stm             dest!, {r6-r7}
+        it              eq
+        popeq           {r4-r9,pc}
+        b               1b
+endfunc
\ No newline at end of file
diff --git a/libavfilter/arm/merge_neon.S b/libavfilter/arm/merge_neon.S
new file mode 100644
index 0000000000..ae36cf3ca9
--- /dev/null
+++ b/libavfilter/arm/merge_neon.S
@@ -0,0 +1,109 @@ 
+/*
+ * Copyright (c) 2009-2012 Rémi Denis-Courmont, VLC authors
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+#define dest r0
+#define src1 r1
+#define src2 r2
+#define size r3
+
+        .align 2
+        @ NOTE: Offset and pitch must be multiple of 16-bytes.
+function ff_merge8_neon, export=1
+        cmp             size, #64
+        blo             2f
+1:
+        pld             [src1, #64]
+        vld1.u8         {q0-q1}, [src1,:128]!
+        pld             [src2, #64]
+        vld1.u8         {q8-q9}, [src2,:128]!
+        vhadd.u8        q0, q0, q8
+        sub             size, size, #64
+        vld1.u8         {q2-q3}, [src1,:128]!
+        vhadd.u8        q1, q1, q9
+        vld1.u8         {q10-q11}, [src2,:128]!
+        vhadd.u8        q2, q2, q10
+        cmp             size, #64
+        vhadd.u8        q3, q3, q11
+        vst1.u8         {q0-q1}, [dest,:128]!
+        vst1.u8         {q2-q3}, [dest,:128]!
+        bhs             1b
+2:
+        cmp             size, #32
+        blo             3f
+        vld1.u8         {q0-q1}, [src1,:128]!
+        sub             size, size, #32
+        vld1.u8         {q8-q9}, [src2,:128]!
+        vhadd.u8        q0, q0, q8
+        vhadd.u8        q1, q1, q9
+        vst1.u8         {q0-q1}, [dest,:128]!
+3:
+        cmp             size, #16
+        it              lo
+        bxlo            lr
+        vld1.u8         {q0}, [src1,:128]!
+        sub             size, size, #16
+        vld1.u8         {q8}, [src2,:128]!
+        vhadd.u8        q0, q0, q8
+        vst1.u8         {q0}, [dest,:128]!
+        bx              lr
+endfunc
+
+        .align 2
+function ff_merge16_neon, export=1
+        cmp             size, #64
+        blo             2f
+1:
+        pld             [src1, #64]
+        vld1.u16        {q0-q1}, [src1,:128]!
+        pld             [src2, #64]
+        vld1.u16        {q8-q9}, [src2,:128]!
+        vhadd.u16       q0, q0, q8
+        sub             size, size, #64
+        vld1.u16        {q2-q3}, [src1,:128]!
+        vhadd.u16       q1, q1, q9
+        vld1.u16        {q10-q11}, [src2,:128]!
+        vhadd.u16       q2, q2, q10
+        cmp             size, #64
+        vhadd.u16       q3, q3, q11
+        vst1.u16        {q0-q1}, [dest,:128]!
+        vst1.u16        {q2-q3}, [dest,:128]!
+        bhs             1b
+2:
+        cmp             size, #32
+        blo             3f
+        vld1.u16        {q0-q1}, [src1,:128]!
+        sub             size, size, #32
+        vld1.u16        {q8-q9}, [src2,:128]!
+        vhadd.u16       q0, q0, q8
+        vhadd.u16       q1, q1, q9
+        vst1.u16        {q0-q1}, [dest,:128]!
+3:
+        cmp             size, #16
+        it              lo
+        bxlo            lr
+        vld1.u16        {q0}, [src1,:128]!
+        sub             size, size, #16
+        vld1.u16        {q8}, [src2,:128]!
+        vhadd.u16       q0, q0, q8
+        vst1.u16        {q0}, [dest,:128]!
+        bx              lr
+endfunc
\ No newline at end of file
diff --git a/libavfilter/vf_fastdeint.c b/libavfilter/vf_fastdeint.c
new file mode 100644
index 0000000000..5ddd8be392
--- /dev/null
+++ b/libavfilter/vf_fastdeint.c
@@ -0,0 +1,588 @@ 
+/*
+ * Copyright (C) 2015 Aman Gupta <aman@tmm1.net>
+ *               2000-2011 VLC authors and VideoLAN
+ *
+ * Author: Sam Hocevar <sam@zoy.org>
+ *         Damien Lucas <nitrox@videolan.org>
+ *         Laurent Aimar <fenrir@videolan.org>
+ *         Sigmund Augdal Helberg <sigmunau@videolan.org>
+ *
+ * These algorithms are derived from the VLC project's
+ * modules/video_filter/deinterlace/algo_basic.c
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/cpu.h"
+#include "libavutil/common.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/timestamp.h"
+#include "avfilter.h"
+#include "formats.h"
+#include "internal.h"
+#include "video.h"
+
+enum Mode {
+  MODE_DISCARD,
+  MODE_MEAN,
+  MODE_BLEND,
+  MODE_BOB,
+  MODE_LINEAR,
+  MODE_MAX,
+};
+
+typedef void (*merge_fn)(void *dst, const void *src1, const void *src2, size_t len);
+
+typedef struct FastDeintContext {
+    const AVClass *class;
+    merge_fn merge;
+    int merge_size;
+    int merge_aligned;
+    AVFrame *cur, *next;
+    enum Mode mode;
+    int eof;
+} FastDeintContext;
+
+static void merge8_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, size_t bytes)
+{
+    for (; bytes > 0; bytes--)
+        *dst++ = ( *src1++ + *src2++ ) >> 1;
+}
+
+static void merge16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, size_t bytes)
+{
+    for (size_t words = bytes / 2; words > 0; words--)
+        *dst++ = ( *src1++ + *src2++ ) >> 1;
+}
+
+static void merge8_unaligned(FastDeintContext *s, uint8_t *dst, const uint8_t *src1, const uint8_t *src2, size_t bytes)
+{
+    if (s->merge_aligned) {
+        size_t remainder = bytes % 16;
+        if (remainder > 0) {
+            merge8_c(dst, src1, src2, remainder);
+            bytes -= remainder;
+            dst += remainder;
+            src1 += remainder;
+            src2 += remainder;
+        }
+    }
+    s->merge(dst, src1, src2, bytes);
+}
+
+static void merge16_unaligned(FastDeintContext *s, uint16_t *dst, const uint16_t *src1, const uint16_t *src2, size_t bytes)
+{
+    if (s->merge_aligned) {
+        size_t words = bytes / 2;
+        size_t remainder = words % 8;
+        if (remainder > 0) {
+            merge16_c(dst, src1, src2, remainder);
+            words -= remainder;
+            dst += remainder;
+            src1 += remainder;
+            src2 += remainder;
+        }
+    }
+    s->merge(dst, src1, src2, bytes);
+}
+
+static void merge_unaligned(FastDeintContext *s, void *dst, const void *src1, const void *src2, size_t bytes)
+{
+    if (s->merge_size == 16)
+        merge16_unaligned(s, dst, src1, src2, bytes);
+    else
+        merge8_unaligned(s, dst, src1, src2, bytes);
+}
+
+#if HAVE_SSE2_INLINE && defined(__x86_64__)
+static void merge8_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, size_t bytes)
+{
+    for(; bytes > 0 && ((uintptr_t)src1 & 15); bytes--)
+        *dst++ = ( *src1++ + *src2++ ) >> 1;
+
+    for (; bytes >= 16; bytes -= 16) {
+        __asm__  __volatile__( "movdqu %2,%%xmm1;"
+                               "pavgb %1, %%xmm1;"
+                               "movdqu %%xmm1, %0" :"=m" (*dst):
+                                                 "m" (*src1),
+                                                 "m" (*src2) : "xmm1" );
+        dst += 16;
+        src1 += 16;
+        src2 += 16;
+    }
+
+    if (bytes > 0) {
+        merge8_c(dst, src1, src2, bytes);
+    }
+}
+static void merge16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, size_t bytes)
+{
+    size_t words = bytes / 2;
+
+    for(; words > 0 && ((uintptr_t)src1 & 15); words--)
+        *dst++ = ( *src1++ + *src2++ ) >> 1;
+
+    for (; words >= 8; words -= 8) {
+        __asm__  __volatile__( "movdqu %2,%%xmm1;"
+                               "pavgw %1, %%xmm1;"
+                               "movdqu %%xmm1, %0" :"=m" (*dst):
+                                                 "m" (*src1),
+                                                 "m" (*src2) : "xmm1" );
+        dst += 8;
+        src1 += 8;
+        src2 += 8;
+    }
+
+    if (words > 0) {
+        merge16_c(dst, src1, src2, words * 2);
+    }
+}
+#define merge8 merge8_sse2
+#define merge16 merge16_sse2
+#else
+#define merge8 merge8_c
+#define merge16 merge16_c
+#endif
+
+static void render_image_single(FastDeintContext *s, AVFrame *out, AVFrame *frame)
+{
+    int i, planes_nb = 0;
+    enum Mode mode = s->mode;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(out->format);
+
+    for (i = 0; i < desc->nb_components; i++)
+        planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
+
+    for (i = 0; i < planes_nb; i++) {
+        int height, bwidth;
+        int dst_linesize, src_linesize;
+        const uint8_t *src;
+        uint8_t *dst;
+
+        bwidth = av_image_get_linesize(out->format, out->width, i);
+        if (bwidth < 0) {
+            av_log(s, AV_LOG_ERROR, "av_image_get_linesize failed\n");
+            return;
+        }
+
+        height = out->height;
+        if (i == 1 || i == 2) {
+            height = FF_CEIL_RSHIFT(out->height, desc->log2_chroma_h);
+        }
+
+        src = frame->data[i];
+        dst = out->data[i];
+        dst_linesize = out->linesize[i];
+        src_linesize = frame->linesize[i];
+
+        if (mode == MODE_BLEND) {
+            // Copy first line
+            memcpy(dst, src, bwidth);
+            dst += dst_linesize;
+            height--;
+        }
+
+        // Merge remaining lines
+        for (; height > 0; height--) {
+            if (mode == MODE_DISCARD)
+                memcpy(dst, src, bwidth);
+            else
+                merge_unaligned(s, dst, src, src + src_linesize, bwidth);
+            dst += dst_linesize;
+            src += src_linesize;
+            if (mode == MODE_MEAN || mode == MODE_DISCARD) {
+                src += src_linesize;
+                height--;
+            }
+        }
+    }
+    if (mode != MODE_DISCARD)
+        emms_c();
+}
+
+static void render_image_doubler(FastDeintContext *s, AVFrame *out, AVFrame *frame, int field)
+{
+    int i, planes_nb = 0;
+    enum Mode mode = s->mode;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(out->format);
+
+    for (i = 0; i < desc->nb_components; i++)
+        planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
+
+    for (i = 0; i < planes_nb; i++) {
+        int height, bwidth;
+        int dst_linesize, src_linesize;
+        const uint8_t *src;
+        uint8_t *dst;
+
+        bwidth = av_image_get_linesize(out->format, out->width, i);
+        if (bwidth < 0) {
+            av_log(s, AV_LOG_ERROR, "av_image_get_linesize failed\n");
+            return;
+        }
+        height = out->height;
+        if (i == 1 || i == 2) {
+            height = FF_CEIL_RSHIFT(out->height, desc->log2_chroma_h);
+        }
+
+        src = frame->data[i];
+        dst = out->data[i];
+        src_linesize = frame->linesize[i];
+        dst_linesize = out->linesize[i];
+
+        // For BOTTOM field we need to add the first line
+        if (field == 1) {
+            memcpy(dst, src, bwidth);
+            dst += dst_linesize;
+            src += src_linesize;
+            height--;
+        }
+
+        height -= 2;
+
+        for (; height > 0; height-=2) {
+            memcpy(dst, src, bwidth);
+            dst += dst_linesize;
+
+            if (mode == MODE_LINEAR)
+                merge_unaligned(s, dst, src, src + 2 * src_linesize, bwidth);
+            else
+                memcpy(dst, src, bwidth);
+            dst += dst_linesize;
+
+            src += src_linesize * 2;
+        }
+
+        memcpy(dst, src, bwidth);
+
+        // For TOP field we need to add the last line
+        if (field == 0)
+        {
+            dst += dst_linesize;
+            src += src_linesize;
+            memcpy(dst, src, bwidth);
+        }
+    }
+    if (mode == MODE_LINEAR)
+        emms_c();
+}
+
+static int filter_frame_single(AVFilterLink *link, AVFrame *frame)
+{
+    AVFilterContext *ctx = link->dst;
+    AVFrame *out;
+    FastDeintContext *s = ctx->priv;
+
+    if (!frame->interlaced_frame) {
+        return ff_filter_frame(ctx->outputs[0], frame);
+    }
+
+    out = ff_get_video_buffer(ctx->outputs[0], link->w, link->h);
+    if (!out) {
+        av_frame_free(&frame);
+        return AVERROR(ENOMEM);
+    }
+
+    av_frame_copy_props(out, frame);
+    out->interlaced_frame = 0;
+    render_image_single(s, out, frame);
+
+    av_frame_free(&frame);
+    return ff_filter_frame(ctx->outputs[0], out);
+}
+
+static AVFrame *copy_frame(AVFilterLink *link, AVFrame *frame)
+{
+    AVFilterContext *ctx = link->dst;
+    AVFrame *out;
+
+    if (frame->format == AV_PIX_FMT_VIDEOTOOLBOX)
+        out = av_frame_alloc();
+    else
+        out = ff_get_video_buffer(ctx->outputs[0], link->w, link->h);
+
+    if (!out)
+        return NULL;
+
+    av_frame_copy_props(out, frame);
+    return out;
+}
+
+static int filter_frame_double(AVFilterLink *link, AVFrame *in)
+{
+    AVFilterContext *ctx = link->dst;
+    FastDeintContext *s = ctx->priv;
+    AVFrame *frame, *out, *out2;
+    int tff, ret;
+
+    s->cur = s->next;
+    s->next = in;
+
+    if (!s->cur) {
+        return 0;
+    }
+
+    frame = s->cur;
+
+    if (!frame->interlaced_frame) {
+        if (frame->pts != AV_NOPTS_VALUE)
+            frame->pts *= 2;
+        s->cur = NULL;
+        return ff_filter_frame(ctx->outputs[0], frame);
+    }
+
+    tff = frame->top_field_first;
+    out = copy_frame(link, frame);
+    if (!out) {
+        av_frame_free(&frame);
+        s->cur = NULL;
+        return AVERROR(ENOMEM);
+    }
+
+    out->interlaced_frame = 0;
+    if (out->pts != AV_NOPTS_VALUE)
+        out->pts = out->pts * 2;
+    render_image_doubler(s, out, frame, !tff);
+
+    ret = ff_filter_frame(ctx->outputs[0], out);
+    if (ret < 0) {
+        av_frame_free(&frame);
+        s->cur = NULL;
+        return ret;
+    }
+
+    out2 = copy_frame(link, frame);
+    if (!out2) {
+        av_frame_free(&frame);
+        s->cur = NULL;
+        return AVERROR(ENOMEM);
+    }
+
+    out2->interlaced_frame = 0;
+    av_frame_remove_side_data(out2, AV_FRAME_DATA_A53_CC);
+    if (out2->pts != AV_NOPTS_VALUE) {
+        out2->pts = frame->pts + s->next->pts;
+    }
+    render_image_doubler(s, out2, frame, tff);
+
+    av_frame_free(&frame);
+    s->cur = NULL;
+
+    return ff_filter_frame(ctx->outputs[0], out2);
+}
+
+static int filter_frame(AVFilterLink *link, AVFrame *frame)
+{
+    AVFilterContext *ctx = link->dst;
+    FastDeintContext *s = ctx->priv;
+
+    av_assert0(frame);
+
+    if (s->mode == MODE_LINEAR || s->mode == MODE_BOB) {
+        return filter_frame_double(link, frame);
+    } else {
+        return filter_frame_single(link, frame);
+    }
+}
+
+static av_cold void uninit(AVFilterContext *ctx)
+{
+    FastDeintContext *s = ctx->priv;
+    av_frame_free(&s->cur);
+    av_frame_free(&s->next);
+}
+
+static int query_formats(AVFilterContext *ctx)
+{
+    static const enum AVPixelFormat pix_fmts[] = {
+        AV_PIX_FMT_YUV420P,
+        AV_PIX_FMT_YUV422P,
+        AV_PIX_FMT_YUV444P,
+        AV_PIX_FMT_YUV410P,
+        AV_PIX_FMT_YUV411P,
+        AV_PIX_FMT_GRAY8,
+        AV_PIX_FMT_YUVJ420P,
+        AV_PIX_FMT_YUVJ422P,
+        AV_PIX_FMT_YUVJ444P,
+        AV_PIX_FMT_GRAY16,
+        AV_PIX_FMT_YUV440P,
+        AV_PIX_FMT_YUVJ440P,
+        AV_PIX_FMT_YUV420P9,
+        AV_PIX_FMT_YUV422P9,
+        AV_PIX_FMT_YUV444P9,
+        AV_PIX_FMT_YUV420P10,
+        AV_PIX_FMT_YUV422P10,
+        AV_PIX_FMT_YUV444P10,
+        AV_PIX_FMT_YUV420P12,
+        AV_PIX_FMT_YUV422P12,
+        AV_PIX_FMT_YUV444P12,
+        AV_PIX_FMT_YUV420P14,
+        AV_PIX_FMT_YUV422P14,
+        AV_PIX_FMT_YUV444P14,
+        AV_PIX_FMT_YUV420P16,
+        AV_PIX_FMT_YUV422P16,
+        AV_PIX_FMT_YUV444P16,
+        AV_PIX_FMT_YUVA420P,
+        AV_PIX_FMT_YUVA422P,
+        AV_PIX_FMT_YUVA444P,
+        AV_PIX_FMT_GBRP,
+        AV_PIX_FMT_GBRP9,
+        AV_PIX_FMT_GBRP10,
+        AV_PIX_FMT_GBRP12,
+        AV_PIX_FMT_GBRP14,
+        AV_PIX_FMT_GBRP16,
+        AV_PIX_FMT_GBRAP,
+        AV_PIX_FMT_NONE
+    };
+
+    AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
+    if (!fmts_list)
+        return AVERROR(ENOMEM);
+    return ff_set_common_formats(ctx, fmts_list);
+}
+
+#if ARCH_ARM
+#include "libavutil/arm/cpu.h"
+#endif
+#if ARCH_AARCH64
+#include "libavutil/aarch64/cpu.h"
+#endif
+#if ARCH_AARCH64 || ARCH_ARM
+void ff_merge8_neon(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, size_t bytes);
+void ff_merge16_neon(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, size_t bytes);
+void ff_merge8_armv6(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, size_t bytes);
+void ff_merge16_armv6(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, size_t bytes);
+#endif
+
+static int config_props(AVFilterLink *link)
+{
+    AVFilterContext *ctx = link->src;
+    FastDeintContext *s = ctx->priv;
+    const AVPixFmtDescriptor *pix;
+#if ARCH_AARCH64 || ARCH_ARM
+    int cpu_flags = av_get_cpu_flags();
+#endif
+
+    link->w = link->src->inputs[0]->w;
+    link->h = link->src->inputs[0]->h;
+    link->time_base  = link->src->inputs[0]->time_base;
+    link->frame_rate = link->src->inputs[0]->frame_rate;
+    link->sample_aspect_ratio = link->src->inputs[0]->sample_aspect_ratio;
+
+    if (s->mode == MODE_MEAN || s->mode == MODE_DISCARD) {
+        link->h /= 2;
+        link->sample_aspect_ratio = av_mul_q(link->sample_aspect_ratio, av_make_q(1, 2));
+    }
+    if (s->mode == MODE_LINEAR || s->mode == MODE_BOB) {
+        link->time_base  = av_mul_q(link->time_base,  av_make_q(1, 2));
+        link->frame_rate = av_mul_q(link->frame_rate, av_make_q(2, 1));
+    }
+
+    pix = av_pix_fmt_desc_get(link->format);
+    s->merge_size = (pix->comp[0].depth > 8) ? 16 : 8;
+    s->merge = s->merge_size == 16 ? (merge_fn)merge16 : (merge_fn)merge8;
+
+#if ARCH_ARM
+    if (have_armv6(cpu_flags)) {
+        s->merge = s->merge_size == 16 ? (merge_fn)ff_merge16_armv6 : (merge_fn)ff_merge8_armv6;
+        s->merge_aligned = 1;
+    }
+#endif
+#if ARCH_AARCH64 || ARCH_ARM
+    if (have_neon(cpu_flags)) {
+        s->merge = s->merge_size == 16 ? (merge_fn)ff_merge16_neon : (merge_fn)ff_merge8_neon;
+        s->merge_aligned = 1;
+    }
+#endif
+
+    return 0;
+}
+
+static int request_frame(AVFilterLink *link)
+{
+    AVFilterContext *ctx = link->src;
+    FastDeintContext *s = ctx->priv;
+    int ret;
+
+    if (s->eof)
+        return AVERROR_EOF;
+
+    ret = ff_request_frame(ctx->inputs[0]);
+
+    if (ret == AVERROR_EOF && s->cur) {
+        AVFrame *next = av_frame_clone(s->next);
+        if (!next)
+            return AVERROR(ENOMEM);
+
+        next->pts = s->next->pts * 2 - s->cur->pts;
+        filter_frame(ctx->inputs[0], next);
+        s->eof = 1;
+    } else if (ret < 0) {
+        return ret;
+    }
+
+    return 0;
+}
+
+#define OFFSET(x) offsetof(FastDeintContext, x)
+#define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
+
+#define CONST(name, help, val, unit) { name, help, 0, AV_OPT_TYPE_CONST, {.i64=val}, INT_MIN, INT_MAX, FLAGS, unit }
+
+static const AVOption fastdeint_options[] = {
+    { "mode", "specify the deinterlacing mode", OFFSET(mode), AV_OPT_TYPE_INT, {.i64=MODE_BLEND}, 0, MODE_MAX-1, FLAGS, "mode" },
+    CONST("discard", "discard bottom frame", MODE_DISCARD, "mode"),
+    CONST("mean", "half resolution blender", MODE_MEAN, "mode"),
+    CONST("blend", "full resolution blender", MODE_BLEND, "mode"),
+    CONST("bob", "bob doubler", MODE_BOB, "mode"),
+    CONST("linear", "bob doubler with linear interpolation", MODE_LINEAR, "mode"),
+
+    { NULL }
+};
+
+AVFILTER_DEFINE_CLASS(fastdeint);
+
+static const AVFilterPad fastdeint_inputs[] = {
+    {
+        .name          = "default",
+        .type          = AVMEDIA_TYPE_VIDEO,
+        .filter_frame  = filter_frame,
+    },
+    { NULL }
+};
+
+static const AVFilterPad fastdeint_outputs[] = {
+    {
+        .name          = "default",
+        .type          = AVMEDIA_TYPE_VIDEO,
+        .config_props  = config_props,
+        .request_frame = request_frame
+    },
+    { NULL }
+};
+
+AVFilter ff_vf_fastdeint = {
+    .name          = "fastdeint",
+    .description   = NULL_IF_CONFIG_SMALL("fast deinterlacing algorithms"),
+    .priv_size     = sizeof(FastDeintContext),
+    .priv_class    = &fastdeint_class,
+    .uninit        = uninit,
+    .query_formats = query_formats,
+    .inputs        = fastdeint_inputs,
+    .outputs       = fastdeint_outputs,
+};