diff mbox

[FFmpeg-devel] avfilter: add hflip x86 SIMD

Message ID 20171202115856.340-1-onemda@gmail.com
State Superseded
Headers show

Commit Message

Paul B Mahol Dec. 2, 2017, 11:58 a.m. UTC
Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavfilter/hflip.h             |  38 ++++++++++++
 libavfilter/vf_hflip.c          | 131 ++++++++++++++++++++++++++--------------
 libavfilter/x86/Makefile        |   2 +
 libavfilter/x86/vf_hflip.asm    |  92 ++++++++++++++++++++++++++++
 libavfilter/x86/vf_hflip_init.c |  41 +++++++++++++
 5 files changed, 257 insertions(+), 47 deletions(-)
 create mode 100644 libavfilter/hflip.h
 create mode 100644 libavfilter/x86/vf_hflip.asm
 create mode 100644 libavfilter/x86/vf_hflip_init.c

Comments

Martin Vignali Dec. 2, 2017, 1:49 p.m. UTC | #1
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +pb_flip_byte:  times 16 db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
> +pb_flip_short: times 16 db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
> +
>

times 16 ?

Martin
Paul B Mahol Dec. 2, 2017, 1:51 p.m. UTC | #2
On 12/2/17, Martin Vignali <martin.vignali@gmail.com> wrote:
>> +
>> +%include "libavutil/x86/x86util.asm"
>> +
>> +SECTION_RODATA
>> +
>> +pb_flip_byte:  times 16 db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
>> +pb_flip_short: times 16 db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
>> +
>>
>
> times 16 ?

Removed.
Martin Vignali Dec. 3, 2017, 4:41 p.m. UTC | #3
Hello,

Maybe you can use a macro for the byte and short versions;
only a few lines are different in each version.

Martin
Paul B Mahol Dec. 3, 2017, 4:46 p.m. UTC | #4
On 12/3/17, Martin Vignali <martin.vignali@gmail.com> wrote:
> Hello,
>
> Maybe you can use a macro for byte and short version,
> only few lines are different in each version

Sure, feel free to send patches.

I'm not very macro proficient.
Martin Vignali Dec. 3, 2017, 4:52 p.m. UTC | #5
2017-12-03 17:46 GMT+01:00 Paul B Mahol <onemda@gmail.com>:

> On 12/3/17, Martin Vignali <martin.vignali@gmail.com> wrote:
> > Hello,
> >
> > Maybe you can use a macro for byte and short version,
> > only few lines are different in each version
>
> Sure, feel free to send patches.
>
> I'm not very macro proficient.
>

Ok, I will take a look.

Martin
Martin Vignali Dec. 3, 2017, 6:09 p.m. UTC | #6
> 2017-12-03 17:46 GMT+01:00 Paul B Mahol <onemda@gmail.com>:
>
>> On 12/3/17, Martin Vignali <martin.vignali@gmail.com> wrote:
>> > Hello,
>> >
>> > Maybe you can use a macro for byte and short version,
>> > only few lines are different in each version
>>
>> Sure, feel free to send patches.
>>
>> I'm not very macro proficient.
>>
>
> Ok, i will take a look.
>
> Martin
>

I wrote a basic checkasm test. It seems like the byte version is slower than C:

hflip_byte_c: 31.8
hflip_byte_ssse3: 108.1
hflip_short_c: 300.1
hflip_short_ssse3: 139.8

(checkasm patch in attach if you want to test)

Martin
James Almer Dec. 3, 2017, 6:41 p.m. UTC | #7
On 12/3/2017 3:09 PM, Martin Vignali wrote:
>> 2017-12-03 17:46 GMT+01:00 Paul B Mahol <onemda@gmail.com>:
>>
>>> On 12/3/17, Martin Vignali <martin.vignali@gmail.com> wrote:
>>>> Hello,
>>>>
>>>> Maybe you can use a macro for byte and short version,
>>>> only few lines are different in each version
>>>
>>> Sure, feel free to send patches.
>>>
>>> I'm not very macro proficient.
>>>
>>
>> Ok, i will take a look.
>>
>> Martin
>>
> 
> I write a basic checkasm test. Seems like the byte version is slower than c
> 
> hflip_byte_c: 31.8
> hflip_byte_ssse3: 108.1
> hflip_short_c: 300.1
> hflip_short_ssse3: 139.8
> 
> (checkasm patch in attach if you want to test)
> 
> Martin

$ tests/checkasm/checkasm.exe --test=vf_hflip --bench
benchmarking with native FFmpeg timers
nop: 32.0
hflip_byte_c: 362.0
hflip_byte_ssse3: 96.0
hflip_short_c: 374.0
hflip_short_ssse3: 121.0

Guess your compiler is really good at optimizing this code, or something
funny is going on.
Can you post a disassembly of hflip_byte_c?
Martin Vignali Dec. 3, 2017, 6:55 p.m. UTC | #8
> Can you post a disassembly of hflip_byte_c?
>
>
> in O1 : clang -S -O1 test_asm_gen.c

    .section    __TEXT,__text,regular,pure_instructions
    .macosx_version_min 10, 12
    .globl    _hflip_byte_c
    .p2align    4, 0x90
_hflip_byte_c:                          ## @hflip_byte_c
    .cfi_startproc
## BB#0:
    pushq    %rbp
Ltmp0:
    .cfi_def_cfa_offset 16
Ltmp1:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
Ltmp2:
    .cfi_def_cfa_register %rbp
    testl    %edx, %edx
    jle    LBB0_3
## BB#1:
    movl    %edx, %eax
    .p2align    4, 0x90
LBB0_2:                                 ## =>This Inner Loop Header: Depth=1
    movzbl    (%rdi), %ecx
    movb    %cl, (%rsi)
    decq    %rdi
    incq    %rsi
    decq    %rax
    jne    LBB0_2
LBB0_3:
    popq    %rbp
    retq
    .cfi_endproc


.subsections_via_symbols






in O2 or O3 : clang -S -O3 test_asm_gen.c

If i correctly understand, same idea than paul's patch
but processing two xmm in the main loop

    .section    __TEXT,__text,regular,pure_instructions
    .macosx_version_min 10, 12
    .section    __TEXT,__literal16,16byte_literals
    .p2align    4
LCPI0_0:
    .byte    15                      ## 0xf
    .byte    14                      ## 0xe
    .byte    13                      ## 0xd
    .byte    12                      ## 0xc
    .byte    11                      ## 0xb
    .byte    10                      ## 0xa
    .byte    9                       ## 0x9
    .byte    8                       ## 0x8
    .byte    7                       ## 0x7
    .byte    6                       ## 0x6
    .byte    5                       ## 0x5
    .byte    4                       ## 0x4
    .byte    3                       ## 0x3
    .byte    2                       ## 0x2
    .byte    1                       ## 0x1
    .byte    0                       ## 0x0
    .section    __TEXT,__text,regular,pure_instructions
    .globl    _hflip_byte_c
    .p2align    4, 0x90
_hflip_byte_c:                          ## @hflip_byte_c
    .cfi_startproc
## BB#0:
    pushq    %rbp
Ltmp0:
    .cfi_def_cfa_offset 16
Ltmp1:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
Ltmp2:
    .cfi_def_cfa_register %rbp
                                        ## kill: %EDX<def> %EDX<kill>
%RDX<def>
    testl    %edx, %edx
    jle    LBB0_17
## BB#1:
    movl    %edx, %r8d
    cmpl    $32, %edx
    jae    LBB0_3
## BB#2:
    xorl    %r11d, %r11d
    jmp    LBB0_11
LBB0_3:
    andl    $31, %edx
    movq    %r8, %r11
    subq    %rdx, %r11
    je    LBB0_7
## BB#4:
    leaq    1(%rdi), %rax
    cmpq    %rsi, %rax
    jbe    LBB0_8
## BB#5:
    leaq    (%rsi,%r8), %r9
    movl    $1, %eax
    subq    %r8, %rax
    addq    %rdi, %rax
    cmpq    %r9, %rax
    jae    LBB0_8
LBB0_7:
    xorl    %r11d, %r11d
    jmp    LBB0_11
LBB0_8:
    leaq    -15(%rdi), %r9
    leaq    16(%rsi), %rax
    movdqa    LCPI0_0(%rip), %xmm0    ## xmm0 =
[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
    movq    %r11, %r10
    .p2align    4, 0x90
LBB0_9:                                 ## =>This Inner Loop Header: Depth=1
    movdqu    -16(%r9), %xmm1
    movdqu    (%r9), %xmm2
    pshufb    %xmm0, %xmm2
    pshufb    %xmm0, %xmm1
    movdqu    %xmm2, -16(%rax)
    movdqu    %xmm1, (%rax)
    addq    $-32, %r9
    addq    $32, %rax
    addq    $-32, %r10
    jne    LBB0_9
## BB#10:
    testl    %edx, %edx
    je    LBB0_17
LBB0_11:
    movl    %r8d, %eax
    subl    %r11d, %eax
    leaq    -1(%r8), %r9
    subq    %r11, %r9
    andq    $3, %rax
    je    LBB0_14
## BB#12:
    movq    %rdi, %rdx
    subq    %r11, %rdx
    negq    %rax
    .p2align    4, 0x90
LBB0_13:                                ## =>This Inner Loop Header: Depth=1
    movzbl    (%rdx), %ecx
    movb    %cl, (%rsi,%r11)
    incq    %r11
    decq    %rdx
    incq    %rax
    jne    LBB0_13
LBB0_14:
    cmpq    $3, %r9
    jb    LBB0_17
## BB#15:
    subq    %r11, %r8
    subq    %r11, %rdi
    leaq    3(%rsi,%r11), %rax
    .p2align    4, 0x90
LBB0_16:                                ## =>This Inner Loop Header: Depth=1
    movzbl    (%rdi), %ecx
    movb    %cl, -3(%rax)
    movzbl    -1(%rdi), %ecx
    movb    %cl, -2(%rax)
    movzbl    -2(%rdi), %ecx
    movb    %cl, -1(%rax)
    movzbl    -3(%rdi), %ecx
    movb    %cl, (%rax)
    addq    $-4, %rdi
    addq    $4, %rax
    addq    $-4, %r8
    jne    LBB0_16
LBB0_17:
    popq    %rbp
    retq
    .cfi_endproc


.subsections_via_symbols
James Almer Dec. 3, 2017, 7 p.m. UTC | #9
On 12/3/2017 3:55 PM, Martin Vignali wrote:
> in O2 or O3 : clang -S -O3 test_asm_gen.c
> 
> If i correctly understand, same idea than paul's patch
> but processing two xmm in the main loop
> 
>     .section    __TEXT,__text,regular,pure_instructions
>     .macosx_version_min 10, 12
>     .section    __TEXT,__literal16,16byte_literals
>     .p2align    4
> LCPI0_0:
>     .byte    15                      ## 0xf
>     .byte    14                      ## 0xe
>     .byte    13                      ## 0xd
>     .byte    12                      ## 0xc
>     .byte    11                      ## 0xb
>     .byte    10                      ## 0xa
>     .byte    9                       ## 0x9
>     .byte    8                       ## 0x8
>     .byte    7                       ## 0x7
>     .byte    6                       ## 0x6
>     .byte    5                       ## 0x5
>     .byte    4                       ## 0x4
>     .byte    3                       ## 0x3
>     .byte    2                       ## 0x2
>     .byte    1                       ## 0x1
>     .byte    0                       ## 0x0
>     .section    __TEXT,__text,regular,pure_instructions
>     .globl    _hflip_byte_c
>     .p2align    4, 0x90
> _hflip_byte_c:                          ## @hflip_byte_c
>     .cfi_startproc
> ## BB#0:
>     pushq    %rbp
> Ltmp0:
>     .cfi_def_cfa_offset 16
> Ltmp1:
>     .cfi_offset %rbp, -16
>     movq    %rsp, %rbp
> Ltmp2:
>     .cfi_def_cfa_register %rbp
>                                         ## kill: %EDX<def> %EDX<kill>
> %RDX<def>
>     testl    %edx, %edx
>     jle    LBB0_17
> ## BB#1:
>     movl    %edx, %r8d
>     cmpl    $32, %edx
>     jae    LBB0_3
> ## BB#2:
>     xorl    %r11d, %r11d
>     jmp    LBB0_11
> LBB0_3:
>     andl    $31, %edx
>     movq    %r8, %r11
>     subq    %rdx, %r11
>     je    LBB0_7
> ## BB#4:
>     leaq    1(%rdi), %rax
>     cmpq    %rsi, %rax
>     jbe    LBB0_8
> ## BB#5:
>     leaq    (%rsi,%r8), %r9
>     movl    $1, %eax
>     subq    %r8, %rax
>     addq    %rdi, %rax
>     cmpq    %r9, %rax
>     jae    LBB0_8
> LBB0_7:
>     xorl    %r11d, %r11d
>     jmp    LBB0_11
> LBB0_8:
>     leaq    -15(%rdi), %r9
>     leaq    16(%rsi), %rax
>     movdqa    LCPI0_0(%rip), %xmm0    ## xmm0 =
> [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
>     movq    %r11, %r10
>     .p2align    4, 0x90
> LBB0_9:                                 ## =>This Inner Loop Header: Depth=1
>     movdqu    -16(%r9), %xmm1
>     movdqu    (%r9), %xmm2
>     pshufb    %xmm0, %xmm2
>     pshufb    %xmm0, %xmm1
>     movdqu    %xmm2, -16(%rax)
>     movdqu    %xmm1, (%rax)
>     addq    $-32, %r9
>     addq    $32, %rax
>     addq    $-32, %r10
>     jne    LBB0_9

Huh, so we're not disabling tree vectorization with clang, only with
GCC. Guess it hasn't generated broken code before to justify disabling it.

In any case, if clang or gcc can generate better code, then the hand
written version needs to be optimized to be as fast or faster.
Paul B Mahol Dec. 3, 2017, 7:28 p.m. UTC | #10
On 12/3/17, Paul B Mahol <onemda@gmail.com> wrote:
> Signed-off-by: Paul B Mahol <onemda@gmail.com>
> ---
>  libavfilter/hflip.h             |  38 ++++++++++++
>  libavfilter/vf_hflip.c          | 133
> ++++++++++++++++++++++++++--------------
>  libavfilter/x86/Makefile        |   2 +
>  libavfilter/x86/vf_hflip.asm    |  98 +++++++++++++++++++++++++++++
>  libavfilter/x86/vf_hflip_init.c |  41 +++++++++++++
>  5 files changed, 265 insertions(+), 47 deletions(-)
>  create mode 100644 libavfilter/hflip.h
>  create mode 100644 libavfilter/x86/vf_hflip.asm
>  create mode 100644 libavfilter/x86/vf_hflip_init.c
>

This is overall ~50% faster than pure C that gcc 6.3.0 gives with
vanilla options.
Paul B Mahol Dec. 3, 2017, 7:30 p.m. UTC | #11
On 12/3/17, Paul B Mahol <onemda@gmail.com> wrote:
> On 12/3/17, Paul B Mahol <onemda@gmail.com> wrote:
>> Signed-off-by: Paul B Mahol <onemda@gmail.com>
>> ---
>>  libavfilter/hflip.h             |  38 ++++++++++++
>>  libavfilter/vf_hflip.c          | 133
>> ++++++++++++++++++++++++++--------------
>>  libavfilter/x86/Makefile        |   2 +
>>  libavfilter/x86/vf_hflip.asm    |  98 +++++++++++++++++++++++++++++
>>  libavfilter/x86/vf_hflip_init.c |  41 +++++++++++++
>>  5 files changed, 265 insertions(+), 47 deletions(-)
>>  create mode 100644 libavfilter/hflip.h
>>  create mode 100644 libavfilter/x86/vf_hflip.asm
>>  create mode 100644 libavfilter/x86/vf_hflip_init.c
>>
>
> This is overall ~50% faster than pure C that gcc 6.3.0 gives with
> vanilla options.
>

By overall I mean this simple bench test:

ffmpeg -f lavfi -i smptehdbars=hd1080 -vf hflip=threads=1 -f null -
Martin Vignali Dec. 3, 2017, 7:31 p.m. UTC | #12
>
> In any case, if clang or gcc can generate better code, then the hand
> written version needs to be optimized to be as fast or faster.
>
>
>
Quick test : pass checkasm (but probably only because width = 256)
hflip_byte_c: 26.4
hflip_byte_ssse3: 20.4


INIT_XMM ssse3
cglobal hflip_byte, 3, 5, 2, src, dst, w, x, v, src2
    mova    m0, [pb_flip_byte]
    xor     xq, xq ; <======
    mov     wd, dword wm
    sub     wq, mmsize * 2
;remove the cmp here <======
    jl .skip

    .loop0: ; process two xmm in the loop
        neg     xq
        movu    m1, [srcq + xq - mmsize + 1]
        movu    m2, [srcq + xq - mmsize * 2 + 1] <======
        pshufb  m1, m0
        pshufb  m2, m0 <======
        neg     xq
        movu    [dstq + xq], m1
        movu    [dstq + xq + mmsize], m2 <======
        add     xq, mmsize * 2 <======
        cmp     xq, wq
        jl .loop0
     RET ; add RET here

; MISSING one xmm process if need

.skip:
    add     wq, mmsize
    .loop1:
        neg    xq
        mov    vb, [srcq + xq]
        neg    xq
        mov    [dstq + xq], vb
        add    xq, 1
        cmp    xq, wq
        jl .loop1
RET


Martin
Paul B Mahol Dec. 3, 2017, 7:36 p.m. UTC | #13
On 12/3/17, Martin Vignali <martin.vignali@gmail.com> wrote:
>>
>> In any case, if clang or gcc can generate better code, then the hand
>> written version needs to be optimized to be as fast or faster.
>>
>>
>>
> Quick test : pass checkasm (but probably only because width = 256)
> hflip_byte_c: 26.4
> hflip_byte_ssse3: 20.4
>
>
> INIT_XMM ssse3
> cglobal hflip_byte, 3, 5, 2, src, dst, w, x, v, src2
>     mova    m0, [pb_flip_byte]
>     xor     xq, xq ; <======
>     mov     wd, dword wm
>     sub     wq, mmsize * 2
> ;remove the cmp here <======
>     jl .skip
>
>     .loop0: ; process two xmm in the loop
>         neg     xq
>         movu    m1, [srcq + xq - mmsize + 1]
>         movu    m2, [srcq + xq - mmsize * 2 + 1] <======
>         pshufb  m1, m0
>         pshufb  m2, m0 <======
>         neg     xq
>         movu    [dstq + xq], m1
>         movu    [dstq + xq + mmsize], m2 <======
>         add     xq, mmsize * 2 <======
>         cmp     xq, wq
>         jl .loop0
>      RET ; add RET here
>
> ; MISSING one xmm process if need
>
> .skip:
>     add     wq, mmsize
>     .loop1:
>         neg    xq
>         mov    vb, [srcq + xq]
>         neg    xq
>         mov    [dstq + xq], vb
>         add    xq, 1
>         cmp    xq, wq
>         jl .loop1
> RET

So what is wrong now?
Martin Vignali Dec. 3, 2017, 7:41 p.m. UTC | #14
2017-12-03 20:36 GMT+01:00 Paul B Mahol <onemda@gmail.com>:

> On 12/3/17, Martin Vignali <martin.vignali@gmail.com> wrote:
> >>
> >> In any case, if clang or gcc can generate better code, then the hand
> >> written version needs to be optimized to be as fast or faster.
> >>
> >>
> >>
> > Quick test : pass checkasm (but probably only because width = 256)
> > hflip_byte_c: 26.4
> > hflip_byte_ssse3: 20.4
> >
> >
> > INIT_XMM ssse3
> > cglobal hflip_byte, 3, 5, 2, src, dst, w, x, v, src2
> >     mova    m0, [pb_flip_byte]
> >     xor     xq, xq ; <======
> >     mov     wd, dword wm
> >     sub     wq, mmsize * 2
> > ;remove the cmp here <======
> >     jl .skip
> >
> >     .loop0: ; process two xmm in the loop
> >         neg     xq
> >         movu    m1, [srcq + xq - mmsize + 1]
> >         movu    m2, [srcq + xq - mmsize * 2 + 1] <======
> >         pshufb  m1, m0
> >         pshufb  m2, m0 <======
> >         neg     xq
> >         movu    [dstq + xq], m1
> >         movu    [dstq + xq + mmsize], m2 <======
> >         add     xq, mmsize * 2 <======
> >         cmp     xq, wq
> >         jl .loop0
> >      RET ; add RET here
> >
> > ; MISSING one xmm process if need
> >
> > .skip:
> >     add     wq, mmsize
> >     .loop1:
> >         neg    xq
> >         mov    vb, [srcq + xq]
> >         neg    xq
> >         mov    [dstq + xq], vb
> >         add    xq, 1
> >         cmp    xq, wq
> >         jl .loop1
> > RET
>
> So what is wrong now?
>

I didn't see your email when I sent mine.

Checkasm results with your last patch (modified for the short version:
"add     xq, mmsize" changed to "add     xq, mmsize * 2"):
hflip_byte_c: 28.0
hflip_byte_ssse3: 127.5
hflip_short_c: 276.5
hflip_short_ssse3: 100.2


Do you think that if you add RET after the end of loop0, it can work in all
cases?
Paul B Mahol Dec. 3, 2017, 7:52 p.m. UTC | #15
On 12/3/17, Martin Vignali <martin.vignali@gmail.com> wrote:
> 2017-12-03 20:36 GMT+01:00 Paul B Mahol <onemda@gmail.com>:
>
>> On 12/3/17, Martin Vignali <martin.vignali@gmail.com> wrote:
>> >>
>> >> In any case, if clang or gcc can generate better code, then the hand
>> >> written version needs to be optimized to be as fast or faster.
>> >>
>> >>
>> >>
>> > Quick test : pass checkasm (but probably only because width = 256)
>> > hflip_byte_c: 26.4
>> > hflip_byte_ssse3: 20.4
>> >
>> >
>> > INIT_XMM ssse3
>> > cglobal hflip_byte, 3, 5, 2, src, dst, w, x, v, src2
>> >     mova    m0, [pb_flip_byte]
>> >     xor     xq, xq ; <======
>> >     mov     wd, dword wm
>> >     sub     wq, mmsize * 2
>> > ;remove the cmp here <======
>> >     jl .skip
>> >
>> >     .loop0: ; process two xmm in the loop
>> >         neg     xq
>> >         movu    m1, [srcq + xq - mmsize + 1]
>> >         movu    m2, [srcq + xq - mmsize * 2 + 1] <======
>> >         pshufb  m1, m0
>> >         pshufb  m2, m0 <======
>> >         neg     xq
>> >         movu    [dstq + xq], m1
>> >         movu    [dstq + xq + mmsize], m2 <======
>> >         add     xq, mmsize * 2 <======
>> >         cmp     xq, wq
>> >         jl .loop0
>> >      RET ; add RET here
>> >
>> > ; MISSING one xmm process if need
>> >
>> > .skip:
>> >     add     wq, mmsize
>> >     .loop1:
>> >         neg    xq
>> >         mov    vb, [srcq + xq]
>> >         neg    xq
>> >         mov    [dstq + xq], vb
>> >         add    xq, 1
>> >         cmp    xq, wq
>> >         jl .loop1
>> > RET
>>
>> So what is wrong now?
>>
>
> Doesn't see your email, when i send mine.
>
> Check asm result with your last patch (and modify for the short version
> "add     xq, mmsize" to "add     xq, mmsize * 2")
> hflip_byte_c: 28.0
> hflip_byte_ssse3: 127.5
> hflip_short_c: 276.5
> hflip_short_ssse3: 100.2
>

Ops, fixed.

>
> Do you think if you add RET after the end of loop0 , it can work in all
> cases ?

No, it would try to read before src, and crash.
diff mbox

Patch

diff --git a/libavfilter/hflip.h b/libavfilter/hflip.h
new file mode 100644
index 0000000000..138380427c
--- /dev/null
+++ b/libavfilter/hflip.h
@@ -0,0 +1,38 @@ 
+/*
+ * Copyright (c) 2007 Benoit Fouet
+ * Copyright (c) 2010 Stefano Sabatini
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_HFLIP_H
+#define AVFILTER_HFLIP_H
+
+#include "avfilter.h"
+
+typedef struct FlipContext {
+    const AVClass *class;
+    int max_step[4];    ///< max pixel step for each plane, expressed as a number of bytes
+    int planewidth[4];  ///< width of each plane
+    int planeheight[4]; ///< height of each plane
+
+    void (*flip_line[4])(const uint8_t *src, uint8_t *dst, int w);
+} FlipContext;
+
+void ff_hflip_init_x86(FlipContext *s, int step[4]);
+
+#endif /* AVFILTER_HFLIP_H */
diff --git a/libavfilter/vf_hflip.c b/libavfilter/vf_hflip.c
index cf20c193f7..303cc8af60 100644
--- a/libavfilter/vf_hflip.c
+++ b/libavfilter/vf_hflip.c
@@ -29,6 +29,7 @@ 
 #include "libavutil/opt.h"
 #include "avfilter.h"
 #include "formats.h"
+#include "hflip.h"
 #include "internal.h"
 #include "video.h"
 #include "libavutil/pixdesc.h"
@@ -36,13 +37,6 @@ 
 #include "libavutil/intreadwrite.h"
 #include "libavutil/imgutils.h"
 
-typedef struct FlipContext {
-    const AVClass *class;
-    int max_step[4];    ///< max pixel step for each plane, expressed as a number of bytes
-    int planewidth[4];  ///< width of each plane
-    int planeheight[4]; ///< height of each plane
-} FlipContext;
-
 static const AVOption hflip_options[] = {
     { NULL }
 };
@@ -67,12 +61,77 @@  static int query_formats(AVFilterContext *ctx)
     return ff_set_common_formats(ctx, pix_fmts);
 }
 
+static void hflip_byte_c(const uint8_t *src, uint8_t *dst, int w)
+{
+    int j;
+
+    for (j = 0; j < w; j++)
+        dst[j] = src[-j];
+}
+
+static void hflip_short_c(const uint8_t *ssrc, uint8_t *ddst, int w)
+{
+    const uint16_t *src = (const uint16_t *)ssrc;
+    uint16_t *dst = (uint16_t *)ddst;
+    int j;
+
+    for (j = 0; j < w; j++)
+        dst[j] = src[-j];
+}
+
+static void hflip_dword_c(const uint8_t *ssrc, uint8_t *ddst, int w)
+{
+    const uint32_t *src = (const uint32_t *)ssrc;
+    uint32_t *dst = (uint32_t *)ddst;
+    int j;
+
+    for (j = 0; j < w; j++)
+        dst[j] = src[-j];
+}
+
+static void hflip_b24_c(const uint8_t *src, uint8_t *dst, int w)
+{
+    const uint8_t *in  = src;
+    uint8_t *out = dst;
+    int j;
+
+    for (j = 0; j < w; j++, out += 3, in -= 3) {
+        int32_t v = AV_RB24(in);
+
+        AV_WB24(out, v);
+    }
+}
+
+static void hflip_b48_c(const uint8_t *src, uint8_t *dst, int w)
+{
+    const uint8_t *in  = src;
+    uint8_t *out = dst;
+    int j;
+
+    for (j = 0; j < w; j++, out += 6, in -= 6) {
+        int64_t v = AV_RB48(in);
+
+        AV_WB48(out, v);
+    }
+}
+
+static void hflip_qword_c(const uint8_t *ssrc, uint8_t *ddst, int w)
+{
+    const uint64_t *src = (const uint64_t *)ssrc;
+    uint64_t *dst = (uint64_t *)ddst;
+    int j;
+
+    for (j = 0; j < w; j++)
+        dst[j] = src[-j];
+}
+
 static int config_props(AVFilterLink *inlink)
 {
     FlipContext *s = inlink->dst->priv;
     const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(inlink->format);
     const int hsub = pix_desc->log2_chroma_w;
     const int vsub = pix_desc->log2_chroma_h;
+    int i;
 
     av_image_fill_max_pixsteps(s->max_step, NULL, pix_desc);
     s->planewidth[0]  = s->planewidth[3]  = inlink->w;
@@ -80,6 +139,22 @@  static int config_props(AVFilterLink *inlink)
     s->planeheight[0] = s->planeheight[3] = inlink->h;
     s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, vsub);
 
+    for (i = 0; i < 4; i++) {
+        switch (s->max_step[i]) {
+        case 1: s->flip_line[i] = hflip_byte_c;  break;
+        case 2: s->flip_line[i] = hflip_short_c; break;
+        case 3: s->flip_line[i] = hflip_b24_c;   break;
+        case 4: s->flip_line[i] = hflip_dword_c; break;
+        case 6: s->flip_line[i] = hflip_b48_c;   break;
+        case 8: s->flip_line[i] = hflip_qword_c; break;
+        default:
+            return AVERROR_BUG;
+        }
+    }
+
+    if (ARCH_X86)
+        ff_hflip_init_x86(s, s->max_step);
+
     return 0;
 }
 
@@ -94,7 +169,7 @@  static int filter_slice(AVFilterContext *ctx, void *arg, int job, int nb_jobs)
     AVFrame *in = td->in;
     AVFrame *out = td->out;
     uint8_t *inrow, *outrow;
-    int i, j, plane, step;
+    int i, plane, step;
 
     for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++) {
         const int width  = s->planewidth[plane];
@@ -107,45 +182,7 @@  static int filter_slice(AVFilterContext *ctx, void *arg, int job, int nb_jobs)
         outrow = out->data[plane] + start * out->linesize[plane];
         inrow  = in ->data[plane] + start * in->linesize[plane] + (width - 1) * step;
         for (i = start; i < end; i++) {
-            switch (step) {
-            case 1:
-                for (j = 0; j < width; j++)
-                    outrow[j] = inrow[-j];
-            break;
-
-            case 2:
-            {
-                uint16_t *outrow16 = (uint16_t *)outrow;
-                uint16_t * inrow16 = (uint16_t *) inrow;
-                for (j = 0; j < width; j++)
-                    outrow16[j] = inrow16[-j];
-            }
-            break;
-
-            case 3:
-            {
-                uint8_t *in  =  inrow;
-                uint8_t *out = outrow;
-                for (j = 0; j < width; j++, out += 3, in -= 3) {
-                    int32_t v = AV_RB24(in);
-                    AV_WB24(out, v);
-                }
-            }
-            break;
-
-            case 4:
-            {
-                uint32_t *outrow32 = (uint32_t *)outrow;
-                uint32_t * inrow32 = (uint32_t *) inrow;
-                for (j = 0; j < width; j++)
-                    outrow32[j] = inrow32[-j];
-            }
-            break;
-
-            default:
-                for (j = 0; j < width; j++)
-                    memcpy(outrow + j*step, inrow - j*step, step);
-            }
+            s->flip_line[plane](inrow, outrow, width);
 
             inrow  += in ->linesize[plane];
             outrow += out->linesize[plane];
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 3431625883..1420954f62 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -5,6 +5,7 @@  OBJS-$(CONFIG_COLORSPACE_FILTER)             += x86/colorspacedsp_init.o
 OBJS-$(CONFIG_EQ_FILTER)                     += x86/vf_eq.o
 OBJS-$(CONFIG_FSPP_FILTER)                   += x86/vf_fspp_init.o
 OBJS-$(CONFIG_GRADFUN_FILTER)                += x86/vf_gradfun_init.o
+OBJS-$(CONFIG_HFLIP_FILTER)                  += x86/vf_hflip_init.o
 OBJS-$(CONFIG_HQDN3D_FILTER)                 += x86/vf_hqdn3d_init.o
 OBJS-$(CONFIG_IDET_FILTER)                   += x86/vf_idet_init.o
 OBJS-$(CONFIG_INTERLACE_FILTER)              += x86/vf_interlace_init.o
@@ -31,6 +32,7 @@  X86ASM-OBJS-$(CONFIG_BWDIF_FILTER)           += x86/vf_bwdif.o
 X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER)      += x86/colorspacedsp.o
 X86ASM-OBJS-$(CONFIG_FSPP_FILTER)            += x86/vf_fspp.o
 X86ASM-OBJS-$(CONFIG_GRADFUN_FILTER)         += x86/vf_gradfun.o
+X86ASM-OBJS-$(CONFIG_HFLIP_FILTER)           += x86/vf_hflip.o
 X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER)          += x86/vf_hqdn3d.o
 X86ASM-OBJS-$(CONFIG_IDET_FILTER)            += x86/vf_idet.o
 X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER)       += x86/vf_interlace.o
diff --git a/libavfilter/x86/vf_hflip.asm b/libavfilter/x86/vf_hflip.asm
new file mode 100644
index 0000000000..2fea31db61
--- /dev/null
+++ b/libavfilter/x86/vf_hflip.asm
@@ -0,0 +1,92 @@ 
+;*****************************************************************************
+;* x86-optimized functions for hflip filter
+;*
+;* Copyright (C) 2017 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;*****************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pb_flip_byte:  times 16 db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+pb_flip_short: times 16 db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
+
+SECTION .text
+
+INIT_XMM ssse3
+cglobal hflip_byte, 3, 5, 2, src, dst, w, x, v
+    mova    m0, [pb_flip_byte]
+    mov     xq, 0
+    mov     wd, dword wm
+    sub     wq, mmsize
+    cmp     wq, mmsize
+    jl .skip
+
+    .loop0:
+        neg     xq
+        movu    m1, [srcq + xq - mmsize + 1]
+        pshufb  m1, m0
+        neg     xq
+        movu    [dstq + xq], m1
+        add     xq, mmsize
+        cmp     xq, wq
+        jl .loop0
+
+.skip:
+    add     wq, mmsize
+    .loop1:
+        neg    xq
+        mov    vb, [srcq + xq]
+        neg    xq
+        mov    [dstq + xq], vb
+        add    xq, 1
+        cmp    xq, wq
+        jl .loop1
+RET
+
+cglobal hflip_short, 3, 5, 2, src, dst, w, x, v
+    mova    m0, [pb_flip_short]
+    mov     xq, 0
+    mov     wd, dword wm
+    add     wq, wq
+    sub     wq, mmsize
+    cmp     wq, mmsize
+    jl .skip
+
+    .loop0:
+        neg     xq
+        movu    m1, [srcq + xq - mmsize + 2]
+        pshufb  m1, m0
+        neg     xq
+        movu    [dstq + xq], m1
+        add     xq, mmsize
+        cmp     xq, wq
+        jl .loop0
+
+.skip:
+    add     wq, mmsize
+    .loop1:
+        neg    xq
+        mov    vw, [srcq + xq]
+        neg    xq
+        mov    [dstq + xq], vw
+        add    xq, 2
+        cmp    xq, wq
+        jl .loop1
+RET
diff --git a/libavfilter/x86/vf_hflip_init.c b/libavfilter/x86/vf_hflip_init.c
new file mode 100644
index 0000000000..d8eab1f905
--- /dev/null
+++ b/libavfilter/x86/vf_hflip_init.c
@@ -0,0 +1,41 @@ 
+/*
+ * Copyright (c) 2017 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/hflip.h"
+
+void ff_hflip_byte_ssse3(const uint8_t *src, uint8_t *dst, int w);
+void ff_hflip_short_ssse3(const uint8_t *src, uint8_t *dst, int w);
+
+av_cold void ff_hflip_init_x86(FlipContext *s, int step[4])
+{
+    int cpu_flags = av_get_cpu_flags();
+    int i;
+
+    for (i = 0; i < 4; i++) {
+        if (EXTERNAL_SSSE3(cpu_flags) && step[i] == 1) {
+            s->flip_line[i] = ff_hflip_byte_ssse3;
+        } else if (EXTERNAL_SSSE3(cpu_flags) && step[i] == 2) {
+            s->flip_line[i] = ff_hflip_short_ssse3;
+        }
+    }
+}