diff mbox

[FFmpeg-devel] avfilter/vf_overlay: add x86 SIMD

Message ID 20180501080221.31362-1-onemda@gmail.com
State Superseded
Headers show

Commit Message

Paul B Mahol May 1, 2018, 8:02 a.m. UTC
Specifically for yuv444, yuv422, yuv420 format when main stream has no alpha, and alpha
is straight.

Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavfilter/vf_overlay.c          |  75 +++++-------------
 libavfilter/vf_overlay.h          |  85 +++++++++++++++++++++
 libavfilter/x86/Makefile          |   2 +
 libavfilter/x86/vf_overlay.asm    | 157 ++++++++++++++++++++++++++++++++++++++
 libavfilter/x86/vf_overlay_init.c |  63 +++++++++++++++
 5 files changed, 326 insertions(+), 56 deletions(-)
 create mode 100644 libavfilter/vf_overlay.h
 create mode 100644 libavfilter/x86/vf_overlay.asm
 create mode 100644 libavfilter/x86/vf_overlay_init.c

Comments

Michael Niedermayer May 1, 2018, 8:20 a.m. UTC | #1
On Tue, May 01, 2018 at 10:02:21AM +0200, Paul B Mahol wrote:
> Specifically for yuv444, yuv422, yuv420 format when main stream has no alpha, and alpha
> is straight.
> 
> Signed-off-by: Paul B Mahol <onemda@gmail.com>
> ---
>  libavfilter/vf_overlay.c          |  75 +++++-------------
>  libavfilter/vf_overlay.h          |  85 +++++++++++++++++++++
>  libavfilter/x86/Makefile          |   2 +
>  libavfilter/x86/vf_overlay.asm    | 157 ++++++++++++++++++++++++++++++++++++++
>  libavfilter/x86/vf_overlay_init.c |  63 +++++++++++++++
>  5 files changed, 326 insertions(+), 56 deletions(-)
>  create mode 100644 libavfilter/vf_overlay.h
>  create mode 100644 libavfilter/x86/vf_overlay.asm
>  create mode 100644 libavfilter/x86/vf_overlay_init.c

breaks build on x86-32

src/libavfilter/x86/vf_overlay.asm:36: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:47: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:48: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:49: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:57: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:58: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:59: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:63: error: symbol `r7d' undefined
src/libavfilter/x86/vf_overlay.asm:68: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:81: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:82: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:84: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:89: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:93: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:101: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:102: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:103: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:107: error: symbol `r7d' undefined
src/libavfilter/x86/vf_overlay.asm:112: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:127: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:128: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:130: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:134: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:137: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:142: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:150: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:151: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:152: error: symbol `r7q' undefined
src/libavfilter/x86/vf_overlay.asm:156: error: symbol `r7d' undefined
make: *** [libavfilter/x86/vf_overlay.o] Error 1
make: *** Waiting for unfinished jobs....



[...]
Paul B Mahol May 1, 2018, 8:45 a.m. UTC | #2
On 5/1/18, Michael Niedermayer <michael@niedermayer.cc> wrote:
> On Tue, May 01, 2018 at 10:02:21AM +0200, Paul B Mahol wrote:
>> Specifically for yuv444, yuv422, yuv420 format when main stream has no
>> alpha, and alpha
>> is straight.
>>
>> Signed-off-by: Paul B Mahol <onemda@gmail.com>
>> ---
>>  libavfilter/vf_overlay.c          |  75 +++++-------------
>>  libavfilter/vf_overlay.h          |  85 +++++++++++++++++++++
>>  libavfilter/x86/Makefile          |   2 +
>>  libavfilter/x86/vf_overlay.asm    | 157
>> ++++++++++++++++++++++++++++++++++++++
>>  libavfilter/x86/vf_overlay_init.c |  63 +++++++++++++++
>>  5 files changed, 326 insertions(+), 56 deletions(-)
>>  create mode 100644 libavfilter/vf_overlay.h
>>  create mode 100644 libavfilter/x86/vf_overlay.asm
>>  create mode 100644 libavfilter/x86/vf_overlay_init.c
>
> breaks build on x86-32
>
> src/libavfilter/x86/vf_overlay.asm:36: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:47: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:48: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:49: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:57: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:58: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:59: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:63: error: symbol `r7d' undefined
> src/libavfilter/x86/vf_overlay.asm:68: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:81: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:82: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:84: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:89: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:93: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:101: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:102: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:103: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:107: error: symbol `r7d' undefined
> src/libavfilter/x86/vf_overlay.asm:112: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:127: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:128: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:130: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:134: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:137: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:142: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:150: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:151: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:152: error: symbol `r7q' undefined
> src/libavfilter/x86/vf_overlay.asm:156: error: symbol `r7d' undefined
> make: *** [libavfilter/x86/vf_overlay.o] Error 1
> make: *** Waiting for unfinished jobs....

Fixed locally.
James Almer May 1, 2018, 7:46 p.m. UTC | #3
On 5/1/2018 5:02 AM, Paul B Mahol wrote:
> Specifically for yuv444, yuv422, yuv420 format when main stream has no alpha, and alpha
> is straight.
> 
> Signed-off-by: Paul B Mahol <onemda@gmail.com>
> ---
>  libavfilter/vf_overlay.c          |  75 +++++-------------
>  libavfilter/vf_overlay.h          |  85 +++++++++++++++++++++
>  libavfilter/x86/Makefile          |   2 +
>  libavfilter/x86/vf_overlay.asm    | 157 ++++++++++++++++++++++++++++++++++++++
>  libavfilter/x86/vf_overlay_init.c |  63 +++++++++++++++
>  5 files changed, 326 insertions(+), 56 deletions(-)
>  create mode 100644 libavfilter/vf_overlay.h
>  create mode 100644 libavfilter/x86/vf_overlay.asm
>  create mode 100644 libavfilter/x86/vf_overlay_init.c

[...]

> diff --git a/libavfilter/x86/vf_overlay.asm b/libavfilter/x86/vf_overlay.asm
> new file mode 100644
> index 0000000000..d639cce9e5
> --- /dev/null
> +++ b/libavfilter/x86/vf_overlay.asm
> @@ -0,0 +1,157 @@
> +;*****************************************************************************
> +;* x86-optimized functions for overlay filter
> +;*
> +;* Copyright (C) 2018 Paul B Mahol
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;*****************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +pw_128:   times 8 dw 128
> +pw_255:   times 8 dw 255
> +pw_257:   times 8 dw 257
> +pw_65280: times 8 dw 65280
> +
> +SECTION .text
> +
> +INIT_XMM sse4
> +cglobal overlay_row_44, 6, 8, 6, 0, d, da, s, a, w, alinesize, r, x

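You're not using the alinesize parameter here. Make this 5, 7, 8 (load
only five arguments and use seven GPRs) and use the register that would
otherwise hold alinesize for r. That way this can work on x86_32, where
r7 does not exist.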

Also, the 0 (the optional stack size argument) after the xmm register
count is pointless. Just remove it.

> +    xor          xq, xq
> +    movsxdifnidn wq, wd
> +    mov          rq, wq
> +    and          rq, mmsize/2 - 1
> +    cmp          wq, mmsize/2
> +    jl .end
> +    sub          wq, rq
> +    mova         m3, [pw_255]
> +    mova         m4, [pw_128]
> +    mova         m5, [pw_257]
> +    .loop0:
> +        pmovzxbw    m0, [sq+xq]
> +        pmovzxbw    m2, [aq+xq]
> +        pmovzxbw    m1, [dq+xq]
> +        pmullw      m0, m2
> +        pxor        m2, m3
> +        pmullw      m1, m2
> +        paddw       m0, m4
> +        paddw       m0, m1
> +        pmulhuw     m0, m5
> +        packuswb    m0, m0
> +        movq   [dq+xq], m0
> +        add         xq, mmsize/2
> +        cmp         xq, wq
> +        jl .loop0
> +
> +    .end:
> +    mov    eax, xd
> +    RET
> +
> +INIT_XMM sse4
> +cglobal overlay_row_22, 6, 8, 8, 0, d, da, s, a, w, al, r, x

Same here with al: it is never used in this function, so don't load it
and use that register for r instead.

> +    xor          xq, xq
> +    movsxdifnidn wq, wd
> +    sub          wq, 1
> +    mov          rq, wq
> +    and          rq, mmsize/2 - 1
> +    cmp          wq, mmsize/2
> +    jl .end
> +    sub          wq, rq
> +    mova         m3, [pw_255]
> +    mova         m4, [pw_128]
> +    mova         m5, [pw_257]
> +    mova         m7, [pw_65280]
> +    .loop0:
> +        pmovzxbw    m0, [sq+xq]
> +        movu        m2, [aq+2*xq]
> +        pand        m2, m3
> +        movu        m6, [aq+2*xq]
> +        pand        m6, m7
> +        psrlw       m6, 8
> +        paddw       m2, m6
> +        psrlw       m2, 1
> +        movu        m6, [aq+2*xq]
> +        pand        m6, m3
> +        paddw       m2, m6
> +        psrlw       m2, 1
> +        pmovzxbw    m1, [dq+xq]
> +        pmullw      m0, m2
> +        pxor        m2, m3
> +        pmullw      m1, m2
> +        paddw       m0, m4
> +        paddw       m0, m1
> +        pmulhuw     m0, m5
> +        packuswb    m0, m0
> +        movq   [dq+xq], m0
> +        add         xq, mmsize/2
> +        cmp         xq, wq
> +        jl .loop0
> +
> +    .end:
> +    mov    eax, xd
> +    RET
> +
> +INIT_XMM sse4
> +cglobal overlay_row_20, 6, 8, 8, 0, d, da, s, a, w, al, r, x
> +    xor          xq, xq
> +    movsxdifnidn wq, wd
> +    sub          wq, 1
> +    mov          rq, wq
> +    and          rq, mmsize/2 - 1
> +    cmp          wq, mmsize/2
> +    jl .end
> +    sub          wq, rq
> +    mov         daq, aq
> +    add         daq, alq

Use al straight from memory here instead of loading it into a register,
and use the GPR that this frees up for r, much like above.

> +    mova         m3, [pw_255]
> +    mova         m4, [pw_128]
> +    mova         m5, [pw_257]
> +    mova         m7, [pw_65280]
> +    .loop0:
> +        pmovzxbw    m0, [sq+xq]
> +        movu        m2, [aq+2*xq]
> +        pand        m2, m3
> +        movu        m6, [aq+2*xq]
> +        pand        m6, m7
> +        psrlw       m6, 8
> +        paddw       m2, m6
> +        movu        m6, [daq+2*xq]
> +        pand        m6, m3
> +        paddw       m2, m6
> +        movu        m6, [daq+2*xq]
> +        pand        m6, m7
> +        psrlw       m6, 8
> +        paddw       m2, m6
> +        psrlw       m2, 2
> +        pmovzxbw    m1, [dq+xq]
> +        pmullw      m0, m2
> +        pxor        m2, m3
> +        pmullw      m1, m2
> +        paddw       m0, m4
> +        paddw       m0, m1
> +        pmulhuw     m0, m5
> +        packuswb    m0, m0
> +        movq   [dq+xq], m0
> +        add         xq, mmsize/2
> +        cmp         xq, wq
> +        jl .loop0
> +
> +    .end:
> +    mov    eax, xd
> +    RET
> diff --git a/libavfilter/x86/vf_overlay_init.c b/libavfilter/x86/vf_overlay_init.c
> new file mode 100644
> index 0000000000..865fd035f6
> --- /dev/null
> +++ b/libavfilter/x86/vf_overlay_init.c
> @@ -0,0 +1,63 @@
> +/*
> + * Copyright (c) 2018 Paul B Mahol
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavfilter/vf_overlay.h"
> +
> +int ff_overlay_row_44_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
> +                           int w, ptrdiff_t alinesize);
> +
> +int ff_overlay_row_20_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
> +                           int w, ptrdiff_t alinesize);
> +
> +int ff_overlay_row_22_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
> +                           int w, ptrdiff_t alinesize);
> +
> +av_cold void ff_overlay_init_x86(OverlayContext *s, int format, int alpha_format, int main_has_alpha)
> +{
> +    int cpu_flags = av_get_cpu_flags();
> +
> +    if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags) &&
> +        (format == OVERLAY_FORMAT_YUV444 ||
> +         format == OVERLAY_FORMAT_GBRP) &&
> +        alpha_format == 0 && main_has_alpha == 0) {
> +        s->blend_row[0] = ff_overlay_row_44_sse4;
> +        s->blend_row[1] = ff_overlay_row_44_sse4;
> +        s->blend_row[2] = ff_overlay_row_44_sse4;
> +    }
> +
> +    if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags) &&
> +        (format == OVERLAY_FORMAT_YUV420) &&
> +        alpha_format == 0 && main_has_alpha == 0) {
> +        s->blend_row[0] = ff_overlay_row_44_sse4;
> +        s->blend_row[1] = ff_overlay_row_20_sse4;
> +        s->blend_row[2] = ff_overlay_row_20_sse4;
> +    }
> +
> +    if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags) &&
> +        (format == OVERLAY_FORMAT_YUV422) &&
> +        alpha_format == 0 && main_has_alpha == 0) {
> +        s->blend_row[0] = ff_overlay_row_44_sse4;
> +        s->blend_row[1] = ff_overlay_row_22_sse4;
> +        s->blend_row[2] = ff_overlay_row_22_sse4;
> +    }

You can remove all the ARCH_X86_64 checks once the changes described
above have been made.

> +}
>
Henrik Gramner May 1, 2018, 7:47 p.m. UTC | #4
On Tue, May 1, 2018 at 10:02 AM, Paul B Mahol <onemda@gmail.com> wrote:
> +cglobal overlay_row_22, 6, 8, 8, 0, d, da, s, a, w, al, r, x
[...]
> +        movu        m2, [aq+2*xq]
> +        pand        m2, m3
> +        movu        m6, [aq+2*xq]
> +        pand        m6, m7
> +        psrlw       m6, 8
> +        paddw       m2, m6
> +        psrlw       m2, 1
> +        movu        m6, [aq+2*xq]
> +        pand        m6, m3
> +        paddw       m2, m6
> +        psrlw       m2, 1

I believe this can be simplified to something like (untested):

    movu        m1, [aq+2*xq]
    pandn       m2, m3, m1
    psllw       m1, 8
    pavgw       m2, m1
    pavgw       m2, m1
    psrlw       m2, 8

> +cglobal overlay_row_20, 6, 8, 8, 0, d, da, s, a, w, al, r, x
[...]
> +        movu        m2, [aq+2*xq]
> +        pand        m2, m3
> +        movu        m6, [aq+2*xq]
> +        pand        m6, m7
> +        psrlw       m6, 8
> +        paddw       m2, m6
> +        movu        m6, [daq+2*xq]
> +        pand        m6, m3
> +        paddw       m2, m6
> +        movu        m6, [daq+2*xq]
> +        pand        m6, m7
> +        psrlw       m6, 8
> +        paddw       m2, m6
> +        psrlw       m2, 2

And this can be simplified to (untested):

    mova        m6, [pb_1]
...
    movu        m2, [aq+2*xq]
    movu        m1, [daq+2*xq]
    pmaddubsw   m2, m6
    pmaddubsw   m1, m6
    paddw       m2, m1
    psrlw       m2, 2
diff mbox

Patch

diff --git a/libavfilter/vf_overlay.c b/libavfilter/vf_overlay.c
index 8c1895cca4..c4d87306f1 100644
--- a/libavfilter/vf_overlay.c
+++ b/libavfilter/vf_overlay.c
@@ -39,6 +39,7 @@ 
 #include "drawutils.h"
 #include "framesync.h"
 #include "video.h"
+#include "vf_overlay.h"
 
 typedef struct ThreadData {
     AVFrame *dst, *src;
@@ -59,21 +60,6 @@  static const char *const var_names[] = {
     NULL
 };
 
-enum var_name {
-    VAR_MAIN_W,    VAR_MW,
-    VAR_MAIN_H,    VAR_MH,
-    VAR_OVERLAY_W, VAR_OW,
-    VAR_OVERLAY_H, VAR_OH,
-    VAR_HSUB,
-    VAR_VSUB,
-    VAR_X,
-    VAR_Y,
-    VAR_N,
-    VAR_POS,
-    VAR_T,
-    VAR_VARS_NB
-};
-
 #define MAIN    0
 #define OVERLAY 1
 
@@ -92,45 +78,6 @@  enum EvalMode {
     EVAL_MODE_NB
 };
 
-enum OverlayFormat {
-    OVERLAY_FORMAT_YUV420,
-    OVERLAY_FORMAT_YUV422,
-    OVERLAY_FORMAT_YUV444,
-    OVERLAY_FORMAT_RGB,
-    OVERLAY_FORMAT_GBRP,
-    OVERLAY_FORMAT_AUTO,
-    OVERLAY_FORMAT_NB
-};
-
-typedef struct OverlayContext {
-    const AVClass *class;
-    int x, y;                   ///< position of overlaid picture
-
-    uint8_t main_is_packed_rgb;
-    uint8_t main_rgba_map[4];
-    uint8_t main_has_alpha;
-    uint8_t overlay_is_packed_rgb;
-    uint8_t overlay_rgba_map[4];
-    uint8_t overlay_has_alpha;
-    int format;                 ///< OverlayFormat
-    int alpha_format;
-    int eval_mode;              ///< EvalMode
-
-    FFFrameSync fs;
-
-    int main_pix_step[4];       ///< steps per pixel for each plane of the main output
-    int overlay_pix_step[4];    ///< steps per pixel for each plane of the overlay
-    int hsub, vsub;             ///< chroma subsampling values
-    const AVPixFmtDescriptor *main_desc; ///< format descriptor for main input
-
-    double var_values[VAR_VARS_NB];
-    char *x_expr, *y_expr;
-
-    AVExpr *x_pexpr, *y_pexpr;
-
-    int (*blend_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
-} OverlayContext;
-
 static av_cold void uninit(AVFilterContext *ctx)
 {
     OverlayContext *s = ctx->priv;
@@ -509,6 +456,7 @@  static av_always_inline void blend_plane(AVFilterContext *ctx,
                                          int jobnr,
                                          int nb_jobs)
 {
+    OverlayContext *octx = ctx->priv;
     int src_wp = AV_CEIL_RSHIFT(src_w, hsub);
     int src_hp = AV_CEIL_RSHIFT(src_h, vsub);
     int dst_wp = AV_CEIL_RSHIFT(dst_w, hsub);
@@ -538,8 +486,18 @@  static av_always_inline void blend_plane(AVFilterContext *ctx,
         s = sp + k;
         a = ap + (k<<hsub);
         da = dap + ((xp+k) << hsub);
+        kmax = FFMIN(-xp + dst_wp, src_wp);
+
+        if (((vsub && j+1 < src_hp) || !vsub) && octx->blend_row[i]) {
+            int c = octx->blend_row[i](d, da, s, a, kmax - k, src->linesize[3]);
 
-        for (kmax = FFMIN(-xp + dst_wp, src_wp); k < kmax; k++) {
+            s += c;
+            d += dst_step * c;
+            da += (1 << hsub) * c;
+            a += (1 << hsub) * c;
+            k += c;
+        }
+        for (; k < kmax; k++) {
             int alpha_v, alpha_h, alpha;
 
             // average alpha for color components, improve quality
@@ -916,7 +874,7 @@  static int config_input_main(AVFilterLink *inlink)
     }
 
     if (!s->alpha_format)
-        return 0;
+        goto end;
 
     switch (s->format) {
     case OVERLAY_FORMAT_YUV420:
@@ -960,6 +918,11 @@  static int config_input_main(AVFilterLink *inlink)
         }
         break;
     }
+
+end:
+    if (ARCH_X86)
+        ff_overlay_init_x86(s, s->format, s->alpha_format, s->main_has_alpha);
+
     return 0;
 }
 
diff --git a/libavfilter/vf_overlay.h b/libavfilter/vf_overlay.h
new file mode 100644
index 0000000000..072ece358f
--- /dev/null
+++ b/libavfilter/vf_overlay.h
@@ -0,0 +1,85 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_OVERLAY_H
+#define AVFILTER_OVERLAY_H
+
+#include "libavutil/eval.h"
+#include "libavutil/pixdesc.h"
+#include "framesync.h"
+#include "avfilter.h"
+
+enum var_name {
+    VAR_MAIN_W,    VAR_MW,
+    VAR_MAIN_H,    VAR_MH,
+    VAR_OVERLAY_W, VAR_OW,
+    VAR_OVERLAY_H, VAR_OH,
+    VAR_HSUB,
+    VAR_VSUB,
+    VAR_X,
+    VAR_Y,
+    VAR_N,
+    VAR_POS,
+    VAR_T,
+    VAR_VARS_NB
+};
+
+enum OverlayFormat {
+    OVERLAY_FORMAT_YUV420,
+    OVERLAY_FORMAT_YUV422,
+    OVERLAY_FORMAT_YUV444,
+    OVERLAY_FORMAT_RGB,
+    OVERLAY_FORMAT_GBRP,
+    OVERLAY_FORMAT_AUTO,
+    OVERLAY_FORMAT_NB
+};
+
+typedef struct OverlayContext {
+    const AVClass *class;
+    int x, y;                   ///< position of overlaid picture
+
+    uint8_t main_is_packed_rgb;
+    uint8_t main_rgba_map[4];
+    uint8_t main_has_alpha;
+    uint8_t overlay_is_packed_rgb;
+    uint8_t overlay_rgba_map[4];
+    uint8_t overlay_has_alpha;
+    int format;                 ///< OverlayFormat
+    int alpha_format;
+    int eval_mode;              ///< EvalMode
+
+    FFFrameSync fs;
+
+    int main_pix_step[4];       ///< steps per pixel for each plane of the main output
+    int overlay_pix_step[4];    ///< steps per pixel for each plane of the overlay
+    int hsub, vsub;             ///< chroma subsampling values
+    const AVPixFmtDescriptor *main_desc; ///< format descriptor for main input
+
+    double var_values[VAR_VARS_NB];
+    char *x_expr, *y_expr;
+
+    AVExpr *x_pexpr, *y_pexpr;
+
+    int (*blend_row[4])(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a, int w,
+                        ptrdiff_t alinesize);
+    int (*blend_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
+} OverlayContext;
+
+void ff_overlay_init_x86(OverlayContext *s, int format, int alpha_format, int main_has_alpha);
+
+#endif /* AVFILTER_OVERLAY_H */
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index f60de3b73b..b484c8bd1c 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -13,6 +13,7 @@  OBJS-$(CONFIG_INTERLACE_FILTER)              += x86/vf_tinterlace_init.o
 OBJS-$(CONFIG_LIMITER_FILTER)                += x86/vf_limiter_init.o
 OBJS-$(CONFIG_MASKEDMERGE_FILTER)            += x86/vf_maskedmerge_init.o
 OBJS-$(CONFIG_NOISE_FILTER)                  += x86/vf_noise.o
+OBJS-$(CONFIG_OVERLAY_FILTER)                += x86/vf_overlay_init.o
 OBJS-$(CONFIG_PP7_FILTER)                    += x86/vf_pp7_init.o
 OBJS-$(CONFIG_PSNR_FILTER)                   += x86/vf_psnr_init.o
 OBJS-$(CONFIG_PULLUP_FILTER)                 += x86/vf_pullup_init.o
@@ -41,6 +42,7 @@  X86ASM-OBJS-$(CONFIG_IDET_FILTER)            += x86/vf_idet.o
 X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER)       += x86/vf_interlace.o
 X86ASM-OBJS-$(CONFIG_LIMITER_FILTER)         += x86/vf_limiter.o
 X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER)     += x86/vf_maskedmerge.o
+X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER)         += x86/vf_overlay.o
 X86ASM-OBJS-$(CONFIG_PP7_FILTER)             += x86/vf_pp7.o
 X86ASM-OBJS-$(CONFIG_PSNR_FILTER)            += x86/vf_psnr.o
 X86ASM-OBJS-$(CONFIG_PULLUP_FILTER)          += x86/vf_pullup.o
diff --git a/libavfilter/x86/vf_overlay.asm b/libavfilter/x86/vf_overlay.asm
new file mode 100644
index 0000000000..d639cce9e5
--- /dev/null
+++ b/libavfilter/x86/vf_overlay.asm
@@ -0,0 +1,157 @@ 
+;*****************************************************************************
+;* x86-optimized functions for overlay filter
+;*
+;* Copyright (C) 2018 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;*****************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_128:   times 8 dw 128
+pw_255:   times 8 dw 255
+pw_257:   times 8 dw 257
+pw_65280: times 8 dw 65280
+
+SECTION .text
+
+INIT_XMM sse4
+cglobal overlay_row_44, 6, 8, 6, 0, d, da, s, a, w, alinesize, r, x
+    xor          xq, xq
+    movsxdifnidn wq, wd
+    mov          rq, wq
+    and          rq, mmsize/2 - 1
+    cmp          wq, mmsize/2
+    jl .end
+    sub          wq, rq
+    mova         m3, [pw_255]
+    mova         m4, [pw_128]
+    mova         m5, [pw_257]
+    .loop0:
+        pmovzxbw    m0, [sq+xq]
+        pmovzxbw    m2, [aq+xq]
+        pmovzxbw    m1, [dq+xq]
+        pmullw      m0, m2
+        pxor        m2, m3
+        pmullw      m1, m2
+        paddw       m0, m4
+        paddw       m0, m1
+        pmulhuw     m0, m5
+        packuswb    m0, m0
+        movq   [dq+xq], m0
+        add         xq, mmsize/2
+        cmp         xq, wq
+        jl .loop0
+
+    .end:
+    mov    eax, xd
+    RET
+
+INIT_XMM sse4
+cglobal overlay_row_22, 6, 8, 8, 0, d, da, s, a, w, al, r, x
+    xor          xq, xq
+    movsxdifnidn wq, wd
+    sub          wq, 1
+    mov          rq, wq
+    and          rq, mmsize/2 - 1
+    cmp          wq, mmsize/2
+    jl .end
+    sub          wq, rq
+    mova         m3, [pw_255]
+    mova         m4, [pw_128]
+    mova         m5, [pw_257]
+    mova         m7, [pw_65280]
+    .loop0:
+        pmovzxbw    m0, [sq+xq]
+        movu        m2, [aq+2*xq]
+        pand        m2, m3
+        movu        m6, [aq+2*xq]
+        pand        m6, m7
+        psrlw       m6, 8
+        paddw       m2, m6
+        psrlw       m2, 1
+        movu        m6, [aq+2*xq]
+        pand        m6, m3
+        paddw       m2, m6
+        psrlw       m2, 1
+        pmovzxbw    m1, [dq+xq]
+        pmullw      m0, m2
+        pxor        m2, m3
+        pmullw      m1, m2
+        paddw       m0, m4
+        paddw       m0, m1
+        pmulhuw     m0, m5
+        packuswb    m0, m0
+        movq   [dq+xq], m0
+        add         xq, mmsize/2
+        cmp         xq, wq
+        jl .loop0
+
+    .end:
+    mov    eax, xd
+    RET
+
+INIT_XMM sse4
+cglobal overlay_row_20, 6, 8, 8, 0, d, da, s, a, w, al, r, x
+    xor          xq, xq
+    movsxdifnidn wq, wd
+    sub          wq, 1
+    mov          rq, wq
+    and          rq, mmsize/2 - 1
+    cmp          wq, mmsize/2
+    jl .end
+    sub          wq, rq
+    mov         daq, aq
+    add         daq, alq
+    mova         m3, [pw_255]
+    mova         m4, [pw_128]
+    mova         m5, [pw_257]
+    mova         m7, [pw_65280]
+    .loop0:
+        pmovzxbw    m0, [sq+xq]
+        movu        m2, [aq+2*xq]
+        pand        m2, m3
+        movu        m6, [aq+2*xq]
+        pand        m6, m7
+        psrlw       m6, 8
+        paddw       m2, m6
+        movu        m6, [daq+2*xq]
+        pand        m6, m3
+        paddw       m2, m6
+        movu        m6, [daq+2*xq]
+        pand        m6, m7
+        psrlw       m6, 8
+        paddw       m2, m6
+        psrlw       m2, 2
+        pmovzxbw    m1, [dq+xq]
+        pmullw      m0, m2
+        pxor        m2, m3
+        pmullw      m1, m2
+        paddw       m0, m4
+        paddw       m0, m1
+        pmulhuw     m0, m5
+        packuswb    m0, m0
+        movq   [dq+xq], m0
+        add         xq, mmsize/2
+        cmp         xq, wq
+        jl .loop0
+
+    .end:
+    mov    eax, xd
+    RET
diff --git a/libavfilter/x86/vf_overlay_init.c b/libavfilter/x86/vf_overlay_init.c
new file mode 100644
index 0000000000..865fd035f6
--- /dev/null
+++ b/libavfilter/x86/vf_overlay_init.c
@@ -0,0 +1,63 @@ 
+/*
+ * Copyright (c) 2018 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/vf_overlay.h"
+
+int ff_overlay_row_44_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
+                           int w, ptrdiff_t alinesize);
+
+int ff_overlay_row_20_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
+                           int w, ptrdiff_t alinesize);
+
+int ff_overlay_row_22_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
+                           int w, ptrdiff_t alinesize);
+
+av_cold void ff_overlay_init_x86(OverlayContext *s, int format, int alpha_format, int main_has_alpha)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags) &&
+        (format == OVERLAY_FORMAT_YUV444 ||
+         format == OVERLAY_FORMAT_GBRP) &&
+        alpha_format == 0 && main_has_alpha == 0) {
+        s->blend_row[0] = ff_overlay_row_44_sse4;
+        s->blend_row[1] = ff_overlay_row_44_sse4;
+        s->blend_row[2] = ff_overlay_row_44_sse4;
+    }
+
+    if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags) &&
+        (format == OVERLAY_FORMAT_YUV420) &&
+        alpha_format == 0 && main_has_alpha == 0) {
+        s->blend_row[0] = ff_overlay_row_44_sse4;
+        s->blend_row[1] = ff_overlay_row_20_sse4;
+        s->blend_row[2] = ff_overlay_row_20_sse4;
+    }
+
+    if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags) &&
+        (format == OVERLAY_FORMAT_YUV422) &&
+        alpha_format == 0 && main_has_alpha == 0) {
+        s->blend_row[0] = ff_overlay_row_44_sse4;
+        s->blend_row[1] = ff_overlay_row_22_sse4;
+        s->blend_row[2] = ff_overlay_row_22_sse4;
+    }
+}