diff mbox

[FFmpeg-devel] avfilter: add hflip x86 SIMD

Message ID 20171203192258.28187-1-onemda@gmail.com
State Superseded
Headers show

Commit Message

Paul B Mahol Dec. 3, 2017, 7:22 p.m. UTC
Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavfilter/hflip.h             |  38 ++++++++++++
 libavfilter/vf_hflip.c          | 133 ++++++++++++++++++++++++++--------------
 libavfilter/x86/Makefile        |   2 +
 libavfilter/x86/vf_hflip.asm    |  98 +++++++++++++++++++++++++++++
 libavfilter/x86/vf_hflip_init.c |  41 +++++++++++++
 5 files changed, 265 insertions(+), 47 deletions(-)
 create mode 100644 libavfilter/hflip.h
 create mode 100644 libavfilter/x86/vf_hflip.asm
 create mode 100644 libavfilter/x86/vf_hflip_init.c

Comments

Martin Vignali Dec. 3, 2017, 7:48 p.m. UTC | #1
Maybe the problem comes from the skip part:

+INIT_XMM ssse3
> +cglobal hflip_byte, 3, 5, 3, src, dst, w, x, v
> +    mova    m0, [pb_flip_byte]
> +    mov     xq, 0
> +    mov     wd, dword wm
> +    sub     wq, 2 * mmsize
> +    cmp     wq, mmsize
> +    jl .skip
> +
> +    .loop0:
> +        neg     xq
> +        movu    m1, [srcq + xq -     mmsize + 1]
> +        movu    m2, [srcq + xq - 2 * mmsize + 1]
> +        pshufb  m1, m0
> +        pshufb  m2, m0
> +        neg     xq
> +        movu    [dstq + xq         ], m1
> +        movu    [dstq + xq + mmsize], m2
> +        add     xq, mmsize * 2
> +        cmp     xq, wq
> +        jl .loop0
> +
> +.skip:
> +    add     wq, 2 * mmsize
>

==> use xq instead of wq ?


> +    .loop1:
> +        neg    xq
> +        mov    vb, [srcq + xq]
> +        neg    xq
> +        mov    [dstq + xq], vb
> +        add    xq, 1
> +        cmp    xq, wq
> +        jl .loop1
> +RET
> +
> +cglobal hflip_short, 3, 5, 3, src, dst, w, x, v
> +    mova    m0, [pb_flip_short]
> +    mov     xq, 0
> +    mov     wd, dword wm
> +    add     wq, wq
> +    sub     wq, 2 * mmsize
> +    cmp     wq, mmsize
> +    jl .skip
> +
> +    .loop0:
> +        neg     xq
> +        movu    m1, [srcq + xq -     mmsize + 2]
> +        movu    m2, [srcq + xq - 2 * mmsize + 2]
> +        pshufb  m1, m0
> +        pshufb  m2, m0
> +        neg     xq
> +        movu    [dstq + xq         ], m1
> +        movu    [dstq + xq + mmsize], m2
> +        add     xq, mmsize
> +        cmp     xq, wq
> +        jl .loop0
> +
> +.skip:
> +    add     wq, 2 * mmsize
>


==> same here ?


+    .loop1:
> +        neg    xq
> +        mov    vw, [srcq + xq]
> +        neg    xq
> +        mov    [dstq + xq], vw
> +        add    xq, 2
> +        cmp    xq, wq
> +        jl .loop1
> +RET
>
Paul B Mahol Dec. 3, 2017, 7:57 p.m. UTC | #2
On 12/3/17, Martin Vignali <martin.vignali@gmail.com> wrote:
> Maybe the problem come from the skip part :
>
> +INIT_XMM ssse3
>> +cglobal hflip_byte, 3, 5, 3, src, dst, w, x, v
>> +    mova    m0, [pb_flip_byte]
>> +    mov     xq, 0
>> +    mov     wd, dword wm
>> +    sub     wq, 2 * mmsize
>> +    cmp     wq, mmsize
>> +    jl .skip
>> +
>> +    .loop0:
>> +        neg     xq
>> +        movu    m1, [srcq + xq -     mmsize + 1]
>> +        movu    m2, [srcq + xq - 2 * mmsize + 1]
>> +        pshufb  m1, m0
>> +        pshufb  m2, m0
>> +        neg     xq
>> +        movu    [dstq + xq         ], m1
>> +        movu    [dstq + xq + mmsize], m2
>> +        add     xq, mmsize * 2
>> +        cmp     xq, wq
>> +        jl .loop0
>> +
>> +.skip:
>> +    add     wq, 2 * mmsize
>>
>
> ==> use xq instead of wq ?

Nope.

>
>
>> +    .loop1:
>> +        neg    xq
>> +        mov    vb, [srcq + xq]
>> +        neg    xq
>> +        mov    [dstq + xq], vb
>> +        add    xq, 1
>> +        cmp    xq, wq
>> +        jl .loop1
>> +RET
>> +
>> +cglobal hflip_short, 3, 5, 3, src, dst, w, x, v
>> +    mova    m0, [pb_flip_short]
>> +    mov     xq, 0
>> +    mov     wd, dword wm
>> +    add     wq, wq
>> +    sub     wq, 2 * mmsize
>> +    cmp     wq, mmsize
>> +    jl .skip
>> +
>> +    .loop0:
>> +        neg     xq
>> +        movu    m1, [srcq + xq -     mmsize + 2]
>> +        movu    m2, [srcq + xq - 2 * mmsize + 2]
>> +        pshufb  m1, m0
>> +        pshufb  m2, m0
>> +        neg     xq
>> +        movu    [dstq + xq         ], m1
>> +        movu    [dstq + xq + mmsize], m2
>> +        add     xq, mmsize
>> +        cmp     xq, wq
>> +        jl .loop0
>> +
>> +.skip:
>> +    add     wq, 2 * mmsize
>>
>
>
> ==> same here ?

Nope. This is for the case when the width is not a multiple of mmsize.
Paul B Mahol Dec. 3, 2017, 8:13 p.m. UTC | #3
On 12/3/17, Paul B Mahol <onemda@gmail.com> wrote:
> On 12/3/17, Martin Vignali <martin.vignali@gmail.com> wrote:
>> Maybe the problem come from the skip part :
>>
>> +INIT_XMM ssse3
>>> +cglobal hflip_byte, 3, 5, 3, src, dst, w, x, v
>>> +    mova    m0, [pb_flip_byte]
>>> +    mov     xq, 0
>>> +    mov     wd, dword wm
>>> +    sub     wq, 2 * mmsize
>>> +    cmp     wq, mmsize
>>> +    jl .skip
>>> +
>>> +    .loop0:
>>> +        neg     xq
>>> +        movu    m1, [srcq + xq -     mmsize + 1]
>>> +        movu    m2, [srcq + xq - 2 * mmsize + 1]
>>> +        pshufb  m1, m0
>>> +        pshufb  m2, m0
>>> +        neg     xq
>>> +        movu    [dstq + xq         ], m1
>>> +        movu    [dstq + xq + mmsize], m2
>>> +        add     xq, mmsize * 2
>>> +        cmp     xq, wq
>>> +        jl .loop0
>>> +
>>> +.skip:
>>> +    add     wq, 2 * mmsize
>>>
>>
>> ==> use xq instead of wq ?
>
> Nope.
>
>>
>>
>>> +    .loop1:
>>> +        neg    xq
>>> +        mov    vb, [srcq + xq]
>>> +        neg    xq
>>> +        mov    [dstq + xq], vb
>>> +        add    xq, 1
>>> +        cmp    xq, wq
>>> +        jl .loop1
>>> +RET
>>> +
>>> +cglobal hflip_short, 3, 5, 3, src, dst, w, x, v
>>> +    mova    m0, [pb_flip_short]
>>> +    mov     xq, 0
>>> +    mov     wd, dword wm
>>> +    add     wq, wq
>>> +    sub     wq, 2 * mmsize
>>> +    cmp     wq, mmsize
>>> +    jl .skip
>>> +
>>> +    .loop0:
>>> +        neg     xq
>>> +        movu    m1, [srcq + xq -     mmsize + 2]
>>> +        movu    m2, [srcq + xq - 2 * mmsize + 2]
>>> +        pshufb  m1, m0
>>> +        pshufb  m2, m0
>>> +        neg     xq
>>> +        movu    [dstq + xq         ], m1
>>> +        movu    [dstq + xq + mmsize], m2
>>> +        add     xq, mmsize
>>> +        cmp     xq, wq
>>> +        jl .loop0
>>> +
>>> +.skip:
>>> +    add     wq, 2 * mmsize
>>>
>>
>>
>> ==> same here ?
>
> Nope, This is for case when width is not multiple of mmsize.
>

Can I get final verdict? I would like to move to other things.
Martin Vignali Dec. 3, 2017, 8:15 p.m. UTC | #4
I modified the checkasm test to test various widths:

if (check_func(s.flip_line[0], "hflip_%s", report_name)) {
        for (i = 1; i < w; i++) {
            call_ref(src, dst_ref, i);
            call_new(src, dst_new, i);
            if (memcmp(dst_ref, dst_new, WIDTH)) {
                printf("FAIL : W = %d\n", i);
                fail();
            }
        }
        bench_new(src, dst_new, WIDTH);
    }


This asm seems to be ok (same idea for the hflip_short version)
hflip_byte_c: 28.4
hflip_byte_ssse3: 23.7
hflip_short_c: 275.9
hflip_short_ssse3: 65.2


INIT_XMM ssse3
cglobal hflip_byte, 3, 5, 3, src, dst, w, x, v
    mova    m0, [pb_flip_byte]
    mov     xq, 0
    mov     wd, dword wm
    sub     wq, 2 * mmsize
    ;cmp     wq, mmsize ; <==== Doesn't seems to be need
    jl .skip

    .loop0:
        neg     xq
        movu    m1, [srcq + xq -     mmsize + 1]
        movu    m2, [srcq + xq - 2 * mmsize + 1]
        pshufb  m1, m0
        pshufb  m2, m0
        neg     xq
        movu    [dstq + xq         ], m1
        movu    [dstq + xq + mmsize], m2
        add     xq, mmsize * 2
        cmp     xq, wq
        jl .loop0

    cmp xq, wq ;<====
    je .end ;<====


   sub xq, mmsize *2 ;<====
   jmp .loop1 ;<====




.skip:
    add     wq, 2 * mmsize
    .loop1:
        neg    xq
        mov    vb, [srcq + xq]
        neg    xq
        mov    [dstq + xq], vb
        add    xq, 1
        cmp    xq, wq
        jl .loop1
.end:
RET
diff mbox

Patch

diff --git a/libavfilter/hflip.h b/libavfilter/hflip.h
new file mode 100644
index 0000000000..138380427c
--- /dev/null
+++ b/libavfilter/hflip.h
@@ -0,0 +1,38 @@ 
+/*
+ * Copyright (c) 2007 Benoit Fouet
+ * Copyright (c) 2010 Stefano Sabatini
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_HFLIP_H
+#define AVFILTER_HFLIP_H
+
+#include "avfilter.h"
+
+typedef struct FlipContext {
+    const AVClass *class;
+    int max_step[4];    ///< max pixel step for each plane, expressed as a number of bytes
+    int planewidth[4];  ///< width of each plane
+    int planeheight[4]; ///< height of each plane
+
+    void (*flip_line[4])(const uint8_t *src, uint8_t *dst, int w); ///< row flip function for each plane
+} FlipContext;
+
+void ff_hflip_init_x86(FlipContext *s, int step[4]);
+
+#endif /* AVFILTER_HFLIP_H */
diff --git a/libavfilter/vf_hflip.c b/libavfilter/vf_hflip.c
index cf20c193f7..030015df0a 100644
--- a/libavfilter/vf_hflip.c
+++ b/libavfilter/vf_hflip.c
@@ -29,6 +29,7 @@ 
 #include "libavutil/opt.h"
 #include "avfilter.h"
 #include "formats.h"
+#include "hflip.h"
 #include "internal.h"
 #include "video.h"
 #include "libavutil/pixdesc.h"
@@ -36,13 +37,6 @@ 
 #include "libavutil/intreadwrite.h"
 #include "libavutil/imgutils.h"
 
-typedef struct FlipContext {
-    const AVClass *class;
-    int max_step[4];    ///< max pixel step for each plane, expressed as a number of bytes
-    int planewidth[4];  ///< width of each plane
-    int planeheight[4]; ///< height of each plane
-} FlipContext;
-
 static const AVOption hflip_options[] = {
     { NULL }
 };
@@ -67,12 +61,77 @@  static int query_formats(AVFilterContext *ctx)
     return ff_set_common_formats(ctx, pix_fmts);
 }
 
+static void hflip_byte_c(const uint8_t *src, uint8_t *dst, int w)
+{
+    int x = 0; /* output index; the matching input byte sits x bytes before src */
+
+    for (; x < w; x++)
+        dst[x] = *(src - x);
+}
+
+static void hflip_short_c(const uint8_t *ssrc, uint8_t *ddst, int w)
+{
+    const uint16_t *in = (const uint16_t *)ssrc; /* read backwards from here */
+    uint16_t *out = (uint16_t *)ddst;            /* written forwards */
+    int x = 0;
+
+    for (; x < w; x++)
+        out[x] = *(in - x);
+}
+
+static void hflip_dword_c(const uint8_t *ssrc, uint8_t *ddst, int w)
+{
+    const uint32_t *in = (const uint32_t *)ssrc; /* read backwards from here */
+    uint32_t *out = (uint32_t *)ddst;            /* written forwards */
+    int x = 0;
+
+    for (; x < w; x++)
+        out[x] = *(in - x);
+}
+
+static void hflip_b24_c(const uint8_t *src, uint8_t *dst, int w)
+{
+    int x;
+
+    /* Each pixel is 3 bytes: output advances by 3 per pixel while input
+     * retreats by 3, so pixel x is read from src - 3 * x. */
+    for (x = 0; x < w; x++) {
+        const int32_t px = AV_RB24(src - 3 * x);
+
+        AV_WB24(dst + 3 * x, px);
+    }
+}
+
+static void hflip_b48_c(const uint8_t *src, uint8_t *dst, int w)
+{
+    int x;
+
+    /* Each pixel is 6 bytes: output advances by 6 per pixel while input
+     * retreats by 6, so pixel x is read from src - 6 * x. */
+    for (x = 0; x < w; x++) {
+        const int64_t px = AV_RB48(src - 6 * x);
+
+        AV_WB48(dst + 6 * x, px);
+    }
+}
+
+static void hflip_qword_c(const uint8_t *ssrc, uint8_t *ddst, int w)
+{
+    const uint64_t *in = (const uint64_t *)ssrc; /* read backwards from here */
+    uint64_t *out = (uint64_t *)ddst;            /* written forwards */
+    int x = 0;
+
+    for (; x < w; x++)
+        out[x] = *(in - x);
+}
+
 static int config_props(AVFilterLink *inlink)
 {
     FlipContext *s = inlink->dst->priv;
     const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(inlink->format);
     const int hsub = pix_desc->log2_chroma_w;
     const int vsub = pix_desc->log2_chroma_h;
+    int nb_planes, i;
 
     av_image_fill_max_pixsteps(s->max_step, NULL, pix_desc);
     s->planewidth[0]  = s->planewidth[3]  = inlink->w;
@@ -80,6 +139,24 @@  static int config_props(AVFilterLink *inlink)
     s->planeheight[0] = s->planeheight[3] = inlink->h;
     s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, vsub);
 
+    nb_planes = av_pix_fmt_count_planes(inlink->format);
+
+    for (i = 0; i < nb_planes; i++) {
+        switch (s->max_step[i]) {
+        case 1: s->flip_line[i] = hflip_byte_c;  break;
+        case 2: s->flip_line[i] = hflip_short_c; break;
+        case 3: s->flip_line[i] = hflip_b24_c;   break;
+        case 4: s->flip_line[i] = hflip_dword_c; break;
+        case 6: s->flip_line[i] = hflip_b48_c;   break;
+        case 8: s->flip_line[i] = hflip_qword_c; break;
+        default:
+            return AVERROR_BUG;
+        }
+    }
+
+    if (ARCH_X86)
+        ff_hflip_init_x86(s, s->max_step);
+
     return 0;
 }
 
@@ -94,7 +171,7 @@  static int filter_slice(AVFilterContext *ctx, void *arg, int job, int nb_jobs)
     AVFrame *in = td->in;
     AVFrame *out = td->out;
     uint8_t *inrow, *outrow;
-    int i, j, plane, step;
+    int i, plane, step;
 
     for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++) {
         const int width  = s->planewidth[plane];
@@ -107,45 +184,7 @@  static int filter_slice(AVFilterContext *ctx, void *arg, int job, int nb_jobs)
         outrow = out->data[plane] + start * out->linesize[plane];
         inrow  = in ->data[plane] + start * in->linesize[plane] + (width - 1) * step;
         for (i = start; i < end; i++) {
-            switch (step) {
-            case 1:
-                for (j = 0; j < width; j++)
-                    outrow[j] = inrow[-j];
-            break;
-
-            case 2:
-            {
-                uint16_t *outrow16 = (uint16_t *)outrow;
-                uint16_t * inrow16 = (uint16_t *) inrow;
-                for (j = 0; j < width; j++)
-                    outrow16[j] = inrow16[-j];
-            }
-            break;
-
-            case 3:
-            {
-                uint8_t *in  =  inrow;
-                uint8_t *out = outrow;
-                for (j = 0; j < width; j++, out += 3, in -= 3) {
-                    int32_t v = AV_RB24(in);
-                    AV_WB24(out, v);
-                }
-            }
-            break;
-
-            case 4:
-            {
-                uint32_t *outrow32 = (uint32_t *)outrow;
-                uint32_t * inrow32 = (uint32_t *) inrow;
-                for (j = 0; j < width; j++)
-                    outrow32[j] = inrow32[-j];
-            }
-            break;
-
-            default:
-                for (j = 0; j < width; j++)
-                    memcpy(outrow + j*step, inrow - j*step, step);
-            }
+            s->flip_line[plane](inrow, outrow, width);
 
             inrow  += in ->linesize[plane];
             outrow += out->linesize[plane];
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index c10f4d5538..2fc5c62644 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -5,6 +5,7 @@  OBJS-$(CONFIG_COLORSPACE_FILTER)             += x86/colorspacedsp_init.o
 OBJS-$(CONFIG_EQ_FILTER)                     += x86/vf_eq.o
 OBJS-$(CONFIG_FSPP_FILTER)                   += x86/vf_fspp_init.o
 OBJS-$(CONFIG_GRADFUN_FILTER)                += x86/vf_gradfun_init.o
+OBJS-$(CONFIG_HFLIP_FILTER)                  += x86/vf_hflip_init.o
 OBJS-$(CONFIG_HQDN3D_FILTER)                 += x86/vf_hqdn3d_init.o
 OBJS-$(CONFIG_IDET_FILTER)                   += x86/vf_idet_init.o
 OBJS-$(CONFIG_INTERLACE_FILTER)              += x86/vf_interlace_init.o
@@ -32,6 +33,7 @@  X86ASM-OBJS-$(CONFIG_BWDIF_FILTER)           += x86/vf_bwdif.o
 X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER)      += x86/colorspacedsp.o
 X86ASM-OBJS-$(CONFIG_FSPP_FILTER)            += x86/vf_fspp.o
 X86ASM-OBJS-$(CONFIG_GRADFUN_FILTER)         += x86/vf_gradfun.o
+X86ASM-OBJS-$(CONFIG_HFLIP_FILTER)           += x86/vf_hflip.o
 X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER)          += x86/vf_hqdn3d.o
 X86ASM-OBJS-$(CONFIG_IDET_FILTER)            += x86/vf_idet.o
 X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER)       += x86/vf_interlace.o
diff --git a/libavfilter/x86/vf_hflip.asm b/libavfilter/x86/vf_hflip.asm
new file mode 100644
index 0000000000..6ffc06e5b2
--- /dev/null
+++ b/libavfilter/x86/vf_hflip.asm
@@ -0,0 +1,98 @@ 
+;*****************************************************************************
+;* x86-optimized functions for hflip filter
+;*
+;* Copyright (C) 2017 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;*****************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pb_flip_byte:  db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
+
+SECTION .text
+
+INIT_XMM ssse3
+cglobal hflip_byte, 3, 5, 3, src, dst, w, x, v   ; dst[i] = src[-i] for i in [0, w)
+    mova    m0, [pb_flip_byte]                   ; pshufb mask that reverses 16 bytes
+    xor     xq, xq                               ; x = output byte offset
+    mov     wd, dword wm
+    sub     wq, 2 * mmsize                       ; loop0 bound: w - 2 * mmsize
+    cmp     wq, mmsize
+    jl .skip                                     ; under 3 * mmsize bytes: scalar only
+
+    .loop0:                                      ; 2 * mmsize bytes per iteration
+        neg     xq                               ; -x indexes src backwards
+        movu    m1, [srcq + xq -     mmsize + 1]
+        pshufb  m1, m0
+        movu    m2, [srcq + xq - 2 * mmsize + 1]
+        pshufb  m2, m0
+        neg     xq                               ; +x indexes dst forwards
+        movu    [dstq + xq         ], m1
+        movu    [dstq + xq + mmsize], m2
+        add     xq, mmsize * 2
+        cmp     xq, wq
+        jl .loop0
+
+.skip:
+    add     wq, 2 * mmsize                       ; restore the full width
+    .loop1:                                      ; remaining bytes, one at a time
+        neg    xq
+        mov    vb, [srcq + xq]
+        neg    xq
+        mov    [dstq + xq], vb
+        inc    xq
+        cmp    xq, wq
+        jl .loop1
+RET
+
+cglobal hflip_short, 3, 5, 3, src, dst, w, x, v  ; 16-bit dst[i] = src[-i] for i in [0, w)
+    mova    m0, [pb_flip_short]                  ; pshufb mask that reverses 8 words
+    xor     xq, xq                               ; x = output byte offset
+    mov     wd, dword wm
+    add     wq, wq                               ; w samples -> 2 * w bytes
+    sub     wq, 2 * mmsize                       ; loop0 bound: bytes - 2 * mmsize
+    cmp     wq, mmsize
+    jl .skip                                     ; under 3 * mmsize bytes: scalar only
+
+    .loop0:                                      ; 2 * mmsize bytes per iteration
+        neg     xq                               ; -x indexes src backwards
+        movu    m1, [srcq + xq -     mmsize + 2]
+        movu    m2, [srcq + xq - 2 * mmsize + 2]
+        pshufb  m1, m0
+        pshufb  m2, m0
+        neg     xq                               ; +x indexes dst forwards
+        movu    [dstq + xq         ], m1
+        movu    [dstq + xq + mmsize], m2
+        add     xq, mmsize * 2                   ; step past both vectors just stored
+        cmp     xq, wq
+        jl .loop0
+
+.skip:
+    add     wq, 2 * mmsize                       ; restore the full byte count
+    .loop1:                                      ; remaining samples, one word at a time
+        neg    xq
+        mov    vw, [srcq + xq]
+        neg    xq
+        mov    [dstq + xq], vw
+        add    xq, 2
+        cmp    xq, wq
+        jl .loop1
+RET
diff --git a/libavfilter/x86/vf_hflip_init.c b/libavfilter/x86/vf_hflip_init.c
new file mode 100644
index 0000000000..d8eab1f905
--- /dev/null
+++ b/libavfilter/x86/vf_hflip_init.c
@@ -0,0 +1,41 @@ 
+/*
+ * Copyright (c) 2017 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/hflip.h"
+
+void ff_hflip_byte_ssse3(const uint8_t *src, uint8_t *dst, int w);
+void ff_hflip_short_ssse3(const uint8_t *src, uint8_t *dst, int w);
+
+av_cold void ff_hflip_init_x86(FlipContext *s, int step[4])
+{
+    const int cpu_flags = av_get_cpu_flags();
+    int plane;
+
+    /* Only 1- and 2-byte steps have SSSE3 versions; other entries stay untouched. */
+    for (plane = 0; EXTERNAL_SSSE3(cpu_flags) && plane < 4; plane++) {
+        switch (step[plane]) {
+        case 1: s->flip_line[plane] = ff_hflip_byte_ssse3;  break;
+        case 2: s->flip_line[plane] = ff_hflip_short_ssse3; break;
+        }
+    }
+}