diff mbox

[FFmpeg-devel] avfilter/vf_overlay: add x86 SIMD for yuv444 format when main stream has no alpha

Message ID 20180430161740.3688-1-onemda@gmail.com
State New
Headers show

Commit Message

Paul B Mahol April 30, 2018, 4:17 p.m. UTC
Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavfilter/vf_overlay.c          | 76 ++++++++-----------------------
 libavfilter/vf_overlay.h          | 84 ++++++++++++++++++++++++++++++++++
 libavfilter/x86/Makefile          |  2 +
 libavfilter/x86/vf_overlay.asm    | 94 +++++++++++++++++++++++++++++++++++++++
 libavfilter/x86/vf_overlay_init.c | 39 ++++++++++++++++
 5 files changed, 238 insertions(+), 57 deletions(-)
 create mode 100644 libavfilter/vf_overlay.h
 create mode 100644 libavfilter/x86/vf_overlay.asm
 create mode 100644 libavfilter/x86/vf_overlay_init.c

Comments

Paul B Mahol April 30, 2018, 4:22 p.m. UTC | #1
On 4/30/18, Paul B Mahol <onemda@gmail.com> wrote:
> Signed-off-by: Paul B Mahol <onemda@gmail.com>
> ---
>  libavfilter/vf_overlay.c          | 76 ++++++++-----------------------
>  libavfilter/vf_overlay.h          | 84 ++++++++++++++++++++++++++++++++++
>  libavfilter/x86/Makefile          |  2 +
>  libavfilter/x86/vf_overlay.asm    | 94
> +++++++++++++++++++++++++++++++++++++++
>  libavfilter/x86/vf_overlay_init.c | 39 ++++++++++++++++
>  5 files changed, 238 insertions(+), 57 deletions(-)
>  create mode 100644 libavfilter/vf_overlay.h
>  create mode 100644 libavfilter/x86/vf_overlay.asm
>  create mode 100644 libavfilter/x86/vf_overlay_init.c
>

Gives a ~15% overall speedup in the one scenario I tried.
Henrik Gramner April 30, 2018, 6:50 p.m. UTC | #2
On Mon, Apr 30, 2018 at 6:17 PM, Paul B Mahol <onemda@gmail.com> wrote:
> +    .loop0:
> +        movu      m1, [dq + xq]
> +        movu      m2, [aq + xq]
> +        movu      m3, [sq + xq]
> +
> +        pshufb       m1, [pb_b2dw]
> +        pshufb       m2, [pb_b2dw]
> +        pshufb       m3, [pb_b2dw]
> +        mova         m4, [pd_255]
> +        psubd        m4, m2
> +        pmulld       m1, m4
> +        pmulld       m3, m2
> +        paddd        m1, m3
> +        paddd        m1, [pd_128]
> +        pmulld       m1, [pd_257]
> +        psrad        m1, 16
> +        pshufb       m1, [pb_dw2b]
> +        movd    [dq+xq], m1
> +        add          xq, mmsize / 4

Unpacking to dwords seems inefficient when you could do something like
this (untested):

    mova         m3, [pw_255]
    mova         m4, [pw_128]
    mova         m5, [pw_257]
.loop0:
    pmovzxbw     m0, [sq + xq]
    pmovzxbw     m2, [aq + xq]
    pmovzxbw     m1, [dq + xq]
    pmullw       m0, m2
    pxor         m2, m3
    pmullw       m1, m2
    paddw        m0, m4
    paddw        m0, m1
    pmulhuw      m0, m5
    packuswb     m0, m0
    movq    [dq+xq], m0
    add          xq, mmsize / 2

which does twice as much per iteration. Also note that pmulld is slow
on most CPUs.

> +    .loop1:
> +        xor         tq, tq
> +        xor         uq, uq
> +        xor         vq, vq
> +        mov         rd, 255
> +        mov         tb, [aq + xq]
> +        neg         tb
> +        add         rb, tb
> +        mov         ub, [sq + xq]
> +        neg         tb
> +        imul        ud, td
> +        mov         vb, [dq + xq]
> +        imul        rd, vd
> +        add         rd, ud
> +        add         rd, 128
> +        imul        rd, 257
> +        sar         rd, 16
> +        mov  [dq + xq], rb
> +        add         xq, 1
> +        cmp         xq, wq
> +        jl .loop1

Is doing the tail in scalar necessary? E.g. can you pad the buffers so
that reading/writing past the end is OK and just run the SIMD loop?

If that's impossible it'd probably be better to do a separate SIMD
loop and pinsr/pextr input/output pixels depending on the number of
elements left.
Paul B Mahol April 30, 2018, 6:57 p.m. UTC | #3
On 4/30/18, Henrik Gramner <henrik@gramner.com> wrote:
> On Mon, Apr 30, 2018 at 6:17 PM, Paul B Mahol <onemda@gmail.com> wrote:
>> +    .loop0:
>> +        movu      m1, [dq + xq]
>> +        movu      m2, [aq + xq]
>> +        movu      m3, [sq + xq]
>> +
>> +        pshufb       m1, [pb_b2dw]
>> +        pshufb       m2, [pb_b2dw]
>> +        pshufb       m3, [pb_b2dw]
>> +        mova         m4, [pd_255]
>> +        psubd        m4, m2
>> +        pmulld       m1, m4
>> +        pmulld       m3, m2
>> +        paddd        m1, m3
>> +        paddd        m1, [pd_128]
>> +        pmulld       m1, [pd_257]
>> +        psrad        m1, 16
>> +        pshufb       m1, [pb_dw2b]
>> +        movd    [dq+xq], m1
>> +        add          xq, mmsize / 4
>
> Unpacking to dwords seems inefficient when you could do something like
> this (untested):
>
>     mova         m3, [pw_255]
>     mova         m4, [pw_128]
>     mova         m5, [pw_257]
> .loop0:
>     pmovzxbw     m0, [sq + xq]
>     pmovzxbw     m2, [aq + xq]
>     pmovzxbw     m1, [dq + xq]
>     pmullw       m0, m2
>     pxor         m2, m3
>     pmullw       m1, m2
>     paddw        m0, m4
>     paddw        m0, m1
>     pmulhuw      m0, m5
>     packuswb     m0, m0
>     movq    [dq+xq], m0
>     add          xq, mmsize / 2


Will experiment with this.

>
> which does twice as much per iteration. Also note that pmulld is slow
> on most CPUs.

This SIMD is not for CPUs found in museums.

>
>> +    .loop1:
>> +        xor         tq, tq
>> +        xor         uq, uq
>> +        xor         vq, vq
>> +        mov         rd, 255
>> +        mov         tb, [aq + xq]
>> +        neg         tb
>> +        add         rb, tb
>> +        mov         ub, [sq + xq]
>> +        neg         tb
>> +        imul        ud, td
>> +        mov         vb, [dq + xq]
>> +        imul        rd, vd
>> +        add         rd, ud
>> +        add         rd, 128
>> +        imul        rd, 257
>> +        sar         rd, 16
>> +        mov  [dq + xq], rb
>> +        add         xq, 1
>> +        cmp         xq, wq
>> +        jl .loop1
>
> Is doing the tail in scalar necessary? E.g. can you pad the buffers so
> that reading/writing past the end is OK and just run the SIMD loop?

Overlay does not operate that way, you can overlay 1 pixel onto hd720 frame.
Do you get it now?

>
> If that's impossible it'd probably be better to do a separate SIMD
> loop and pinsr/pextr input/output pixels depending on the number of
> elements left.

That seems too complicated.
James Almer April 30, 2018, 7:05 p.m. UTC | #4
On 4/30/2018 3:57 PM, Paul B Mahol wrote:
> On 4/30/18, Henrik Gramner <henrik@gramner.com> wrote:
>> On Mon, Apr 30, 2018 at 6:17 PM, Paul B Mahol <onemda@gmail.com> wrote:
>>> +    .loop0:
>>> +        movu      m1, [dq + xq]
>>> +        movu      m2, [aq + xq]
>>> +        movu      m3, [sq + xq]
>>> +
>>> +        pshufb       m1, [pb_b2dw]
>>> +        pshufb       m2, [pb_b2dw]
>>> +        pshufb       m3, [pb_b2dw]
>>> +        mova         m4, [pd_255]
>>> +        psubd        m4, m2
>>> +        pmulld       m1, m4
>>> +        pmulld       m3, m2
>>> +        paddd        m1, m3
>>> +        paddd        m1, [pd_128]
>>> +        pmulld       m1, [pd_257]
>>> +        psrad        m1, 16
>>> +        pshufb       m1, [pb_dw2b]
>>> +        movd    [dq+xq], m1
>>> +        add          xq, mmsize / 4
>>
>> Unpacking to dwords seems inefficient when you could do something like
>> this (untested):
>>
>>     mova         m3, [pw_255]
>>     mova         m4, [pw_128]
>>     mova         m5, [pw_257]
>> .loop0:
>>     pmovzxbw     m0, [sq + xq]
>>     pmovzxbw     m2, [aq + xq]
>>     pmovzxbw     m1, [dq + xq]
>>     pmullw       m0, m2
>>     pxor         m2, m3
>>     pmullw       m1, m2
>>     paddw        m0, m4
>>     paddw        m0, m1
>>     pmulhuw      m0, m5
>>     packuswb     m0, m0
>>     movq    [dq+xq], m0
>>     add          xq, mmsize / 2
> 
> 
> Will experiment with this.
> 
>>
>> which does twice as much per iteration. Also note that pmulld is slow
>> on most CPUs.
> 
> This SIMD is not for CPUs found in museums.

pmulld is sse4.1 and no museum CPU supports it.

It's the slowest multiplication instruction by far on every CPU that
supports it (in some cases twice as slow as pmullw, pmuldq, etc), and if
it can be avoided, it absolutely should.

> 
>>
>>> +    .loop1:
>>> +        xor         tq, tq
>>> +        xor         uq, uq
>>> +        xor         vq, vq
>>> +        mov         rd, 255
>>> +        mov         tb, [aq + xq]
>>> +        neg         tb
>>> +        add         rb, tb
>>> +        mov         ub, [sq + xq]
>>> +        neg         tb
>>> +        imul        ud, td
>>> +        mov         vb, [dq + xq]
>>> +        imul        rd, vd
>>> +        add         rd, ud
>>> +        add         rd, 128
>>> +        imul        rd, 257
>>> +        sar         rd, 16
>>> +        mov  [dq + xq], rb
>>> +        add         xq, 1
>>> +        cmp         xq, wq
>>> +        jl .loop1
>>
>> Is doing the tail in scalar necessary? E.g. can you pad the buffers so
>> that reading/writing past the end is OK and just run the SIMD loop?
> 
> Overlay does not operate that way, you can overlay 1 pixel onto hd720 frame.
> Do you get it now?
> 
>>
>> If that's impossible it'd probably be better to do a separate SIMD
>> loop and pinsr/pextr input/output pixels depending on the number of
>> elements left.
> 
> That seems too complicated.
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
diff mbox

Patch

diff --git a/libavfilter/vf_overlay.c b/libavfilter/vf_overlay.c
index 8c1895cca4..81522d31a4 100644
--- a/libavfilter/vf_overlay.c
+++ b/libavfilter/vf_overlay.c
@@ -39,6 +39,7 @@ 
 #include "drawutils.h"
 #include "framesync.h"
 #include "video.h"
+#include "vf_overlay.h"
 
 typedef struct ThreadData {
     AVFrame *dst, *src;
@@ -59,21 +60,6 @@  static const char *const var_names[] = {
     NULL
 };
 
-enum var_name {
-    VAR_MAIN_W,    VAR_MW,
-    VAR_MAIN_H,    VAR_MH,
-    VAR_OVERLAY_W, VAR_OW,
-    VAR_OVERLAY_H, VAR_OH,
-    VAR_HSUB,
-    VAR_VSUB,
-    VAR_X,
-    VAR_Y,
-    VAR_N,
-    VAR_POS,
-    VAR_T,
-    VAR_VARS_NB
-};
-
 #define MAIN    0
 #define OVERLAY 1
 
@@ -92,45 +78,6 @@  enum EvalMode {
     EVAL_MODE_NB
 };
 
-enum OverlayFormat {
-    OVERLAY_FORMAT_YUV420,
-    OVERLAY_FORMAT_YUV422,
-    OVERLAY_FORMAT_YUV444,
-    OVERLAY_FORMAT_RGB,
-    OVERLAY_FORMAT_GBRP,
-    OVERLAY_FORMAT_AUTO,
-    OVERLAY_FORMAT_NB
-};
-
-typedef struct OverlayContext {
-    const AVClass *class;
-    int x, y;                   ///< position of overlaid picture
-
-    uint8_t main_is_packed_rgb;
-    uint8_t main_rgba_map[4];
-    uint8_t main_has_alpha;
-    uint8_t overlay_is_packed_rgb;
-    uint8_t overlay_rgba_map[4];
-    uint8_t overlay_has_alpha;
-    int format;                 ///< OverlayFormat
-    int alpha_format;
-    int eval_mode;              ///< EvalMode
-
-    FFFrameSync fs;
-
-    int main_pix_step[4];       ///< steps per pixel for each plane of the main output
-    int overlay_pix_step[4];    ///< steps per pixel for each plane of the overlay
-    int hsub, vsub;             ///< chroma subsampling values
-    const AVPixFmtDescriptor *main_desc; ///< format descriptor for main input
-
-    double var_values[VAR_VARS_NB];
-    char *x_expr, *y_expr;
-
-    AVExpr *x_pexpr, *y_pexpr;
-
-    int (*blend_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
-} OverlayContext;
-
 static av_cold void uninit(AVFilterContext *ctx)
 {
     OverlayContext *s = ctx->priv;
@@ -509,6 +456,7 @@  static av_always_inline void blend_plane(AVFilterContext *ctx,
                                          int jobnr,
                                          int nb_jobs)
 {
+    OverlayContext *octx = ctx->priv;
     int src_wp = AV_CEIL_RSHIFT(src_w, hsub);
     int src_hp = AV_CEIL_RSHIFT(src_h, vsub);
     int dst_wp = AV_CEIL_RSHIFT(dst_w, hsub);
@@ -538,8 +486,17 @@  static av_always_inline void blend_plane(AVFilterContext *ctx,
         s = sp + k;
         a = ap + (k<<hsub);
         da = dap + ((xp+k) << hsub);
-
-        for (kmax = FFMIN(-xp + dst_wp, src_wp); k < kmax; k++) {
+        kmax = FFMIN(-xp + dst_wp, src_wp);
+
+        if (octx->blend_row) {
+            octx->blend_row(d, da, s, a, kmax - k, k, j, src_wp, src_hp);
+            dp += dst->linesize[dst_plane];
+            sp += src->linesize[i];
+            ap += (1 << vsub) * src->linesize[3];
+            dap += (1 << vsub) * dst->linesize[3];
+            continue;
+        }
+        for (; k < kmax; k++) {
             int alpha_v, alpha_h, alpha;
 
             // average alpha for color components, improve quality
@@ -916,7 +873,7 @@  static int config_input_main(AVFilterLink *inlink)
     }
 
     if (!s->alpha_format)
-        return 0;
+        goto end;
 
     switch (s->format) {
     case OVERLAY_FORMAT_YUV420:
@@ -960,6 +917,11 @@  static int config_input_main(AVFilterLink *inlink)
         }
         break;
     }
+
+end:
+    if (ARCH_X86)
+        ff_overlay_init_x86(s, s->format, s->alpha_format, s->main_has_alpha);
+
     return 0;
 }
 
diff --git a/libavfilter/vf_overlay.h b/libavfilter/vf_overlay.h
new file mode 100644
index 0000000000..8eb91d9a34
--- /dev/null
+++ b/libavfilter/vf_overlay.h
@@ -0,0 +1,84 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_OVERLAY_H
+#define AVFILTER_OVERLAY_H
+
+#include "libavutil/eval.h"
+#include "libavutil/pixdesc.h"
+#include "framesync.h"
+#include "avfilter.h"
+
+enum var_name {                 /* indices into OverlayContext.var_values[], which is sized by VAR_VARS_NB */
+    VAR_MAIN_W,    VAR_MW,
+    VAR_MAIN_H,    VAR_MH,
+    VAR_OVERLAY_W, VAR_OW,
+    VAR_OVERLAY_H, VAR_OH,
+    VAR_HSUB,
+    VAR_VSUB,
+    VAR_X,
+    VAR_Y,
+    VAR_N,
+    VAR_POS,
+    VAR_T,
+    VAR_VARS_NB                 /* element count of var_values[]; NOTE(review): presumably must stay last and in step with var_names[] in vf_overlay.c - confirm */
+};
+
+enum OverlayFormat {            /* values held in OverlayContext.format */
+    OVERLAY_FORMAT_YUV420,
+    OVERLAY_FORMAT_YUV422,
+    OVERLAY_FORMAT_YUV444,      /* one of the two formats ff_overlay_init_x86() installs a row blender for */
+    OVERLAY_FORMAT_RGB,
+    OVERLAY_FORMAT_GBRP,        /* the other format ff_overlay_init_x86() installs a row blender for */
+    OVERLAY_FORMAT_AUTO,
+    OVERLAY_FORMAT_NB
+};
+
+typedef struct OverlayContext {
+    const AVClass *class;
+    int x, y;                   ///< position of overlaid picture
+
+    uint8_t main_is_packed_rgb;
+    uint8_t main_rgba_map[4];
+    uint8_t main_has_alpha;
+    uint8_t overlay_is_packed_rgb;
+    uint8_t overlay_rgba_map[4];
+    uint8_t overlay_has_alpha;
+    int format;                 ///< OverlayFormat
+    int alpha_format;
+    int eval_mode;              ///< EvalMode
+
+    FFFrameSync fs;
+
+    int main_pix_step[4];       ///< steps per pixel for each plane of the main output
+    int overlay_pix_step[4];    ///< steps per pixel for each plane of the overlay
+    int hsub, vsub;             ///< chroma subsampling values
+    const AVPixFmtDescriptor *main_desc; ///< format descriptor for main input
+
+    double var_values[VAR_VARS_NB];
+    char *x_expr, *y_expr;
+
+    AVExpr *x_pexpr, *y_pexpr;
+    /* Optional row blender set by ff_overlay_init_x86(); if non-NULL, blend_plane() calls it once per row. */
+    void (*blend_row)(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a, int w, int x, int y, int src_w, int src_h);
+    int (*blend_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
+} OverlayContext;
+
+void ff_overlay_init_x86(OverlayContext *s, int format, int alpha_format, int main_has_alpha);
+
+#endif /* AVFILTER_OVERLAY_H */
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index f60de3b73b..b484c8bd1c 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -13,6 +13,7 @@  OBJS-$(CONFIG_INTERLACE_FILTER)              += x86/vf_tinterlace_init.o
 OBJS-$(CONFIG_LIMITER_FILTER)                += x86/vf_limiter_init.o
 OBJS-$(CONFIG_MASKEDMERGE_FILTER)            += x86/vf_maskedmerge_init.o
 OBJS-$(CONFIG_NOISE_FILTER)                  += x86/vf_noise.o
+OBJS-$(CONFIG_OVERLAY_FILTER)                += x86/vf_overlay_init.o
 OBJS-$(CONFIG_PP7_FILTER)                    += x86/vf_pp7_init.o
 OBJS-$(CONFIG_PSNR_FILTER)                   += x86/vf_psnr_init.o
 OBJS-$(CONFIG_PULLUP_FILTER)                 += x86/vf_pullup_init.o
@@ -41,6 +42,7 @@  X86ASM-OBJS-$(CONFIG_IDET_FILTER)            += x86/vf_idet.o
 X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER)       += x86/vf_interlace.o
 X86ASM-OBJS-$(CONFIG_LIMITER_FILTER)         += x86/vf_limiter.o
 X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER)     += x86/vf_maskedmerge.o
+X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER)         += x86/vf_overlay.o
 X86ASM-OBJS-$(CONFIG_PP7_FILTER)             += x86/vf_pp7.o
 X86ASM-OBJS-$(CONFIG_PSNR_FILTER)            += x86/vf_psnr.o
 X86ASM-OBJS-$(CONFIG_PULLUP_FILTER)          += x86/vf_pullup.o
diff --git a/libavfilter/x86/vf_overlay.asm b/libavfilter/x86/vf_overlay.asm
new file mode 100644
index 0000000000..41f74fe946
--- /dev/null
+++ b/libavfilter/x86/vf_overlay.asm
@@ -0,0 +1,94 @@ 
+;*****************************************************************************
+;* x86-optimized functions for overlay filter
+;*
+;* Copyright (C) 2018 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;*****************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pd_128: times  4 dd 128
+pd_255: times  4 dd 255
+pd_257: times  4 dd 257
+pb_b2dw: db 0,-1,-1,-1, 1,-1,-1,-1, 2,-1,-1,-1, 3,-1,-1,-1
+pb_dw2b: db 0, 4, 8,12,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
+
+SECTION .text
+
+INIT_XMM sse4
+; void ff_overlay_row_yuv444_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
+;                                 int w, int x, int y, int src_w, int src_h)
+; In place, for i in [0, w): d[i] = ((d[i]*(255 - a[i]) + s[i]*a[i] + 128) * 257) >> 16
+; Only d, s, a and w are read; r (tail count, then scalar accumulator), x (index)
+; and t, u, v are scratch registers. Does nothing when w <= 0.
+cglobal overlay_row_yuv444, 5, 10, 6, 0, d, da, s, a, w, r, x, t, u, v
+    movsxdifnidn wq, wd
+    test         wq, wq
+    jle .end                          ; empty row: never touch memory
+    xor          xd, xd
+    mov          rq, wq
+    and          rq, mmsize/4 - 1     ; r = pixels left for the scalar tail
+    sub          wq, rq               ; w = pixels handled by the SIMD loop
+    jz .tail                          ; fewer than mmsize/4 pixels in total
+    mova         m0, [pd_255]
+    mova         m4, [pd_128]
+    mova         m5, [pd_257]
+    .loop0:
+        pmovzxbd     m1, [dq + xq]    ; loads exactly 4 bytes, no over-read
+        pmovzxbd     m2, [aq + xq]
+        pmovzxbd     m3, [sq + xq]
+        pmulld       m3, m2           ; s * a
+        pxor         m2, m0           ; 255 - a (valid because a is in 0..255)
+        pmulld       m1, m2           ; d * (255 - a)
+        paddd        m1, m3
+        paddd        m1, m4
+        pmulld       m1, m5
+        psrad        m1, 16
+        pshufb       m1, [pb_dw2b]    ; pack the four dwords back to bytes
+        movd    [dq+xq], m1
+
+        add          xq, mmsize / 4
+        cmp          xq, wq
+        jl .loop0
+
+    test         rq, rq
+    jz .end
+
+    .tail:
+    add          wq, rq               ; restore the full width
+
+    .loop1:
+        movzx       td, byte [aq + xq]
+        movzx       ud, byte [sq + xq]
+        movzx       vd, byte [dq + xq]
+        mov         rd, 255
+        sub         rd, td            ; 255 - a
+        imul        ud, td            ; s * a
+        imul        rd, vd            ; d * (255 - a)
+        add         rd, ud
+        add         rd, 128
+        imul        rd, 257
+        sar         rd, 16
+        mov  [dq + xq], rb
+        add         xq, 1
+        cmp         xq, wq
+        jl .loop1
+    .end:
+        RET
diff --git a/libavfilter/x86/vf_overlay_init.c b/libavfilter/x86/vf_overlay_init.c
new file mode 100644
index 0000000000..f57c850a30
--- /dev/null
+++ b/libavfilter/x86/vf_overlay_init.c
@@ -0,0 +1,39 @@ 
+/*
+ * Copyright (c) 2018 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/vf_overlay.h"
+
+void ff_overlay_row_yuv444_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
+                                int w, int x, int y, int src_w, int src_h);
+
+av_cold void ff_overlay_init_x86(OverlayContext *s, int format, int alpha_format, int main_has_alpha)
+{
+    const int cpu_flags = av_get_cpu_flags();
+    /* Formats the row blender is installed for. */
+    const int format_ok = format == OVERLAY_FORMAT_YUV444 || format == OVERLAY_FORMAT_GBRP;
+
+    /* Leave blend_row untouched unless every precondition of the SSE4 routine holds. */
+    if (!ARCH_X86_64 || !EXTERNAL_SSE4(cpu_flags) || !format_ok || alpha_format != 0 || main_has_alpha != 0)
+        return;
+    s->blend_row = ff_overlay_row_yuv444_sse4;
+}