Message ID | 20171202115856.340-1-onemda@gmail.com |
---|---|
State | Superseded |
Headers | show |
> + > +%include "libavutil/x86/x86util.asm" > + > +SECTION_RODATA > + > +pb_flip_byte: times 16 db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 > +pb_flip_short: times 16 db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1 > + > times 16 ? Martin
On 12/2/17, Martin Vignali <martin.vignali@gmail.com> wrote: >> + >> +%include "libavutil/x86/x86util.asm" >> + >> +SECTION_RODATA >> + >> +pb_flip_byte: times 16 db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 >> +pb_flip_short: times 16 db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1 >> + >> > > times 16 ? Removed.
Hello, Maybe you can use a macro for byte and short version, only a few lines are different in each version Martin
On 12/3/17, Martin Vignali <martin.vignali@gmail.com> wrote: > Hello, > > Maybe you can use a macro for byte and short version, > only few lines are different in each version Sure, feel free to send patches. I'm not very macro proficient.
2017-12-03 17:46 GMT+01:00 Paul B Mahol <onemda@gmail.com>: > On 12/3/17, Martin Vignali <martin.vignali@gmail.com> wrote: > > Hello, > > > > Maybe you can use a macro for byte and short version, > > only few lines are different in each version > > Sure, feel free to send patches. > > I'm not very macro proficient. > Ok, i will take a look. Martin
> 2017-12-03 17:46 GMT+01:00 Paul B Mahol <onemda@gmail.com>: > >> On 12/3/17, Martin Vignali <martin.vignali@gmail.com> wrote: >> > Hello, >> > >> > Maybe you can use a macro for byte and short version, >> > only few lines are different in each version >> >> Sure, feel free to send patches. >> >> I'm not very macro proficient. >> > > Ok, i will take a look. > > Martin > I write a basic checkasm test. Seems like the byte version is slower than c hflip_byte_c: 31.8 hflip_byte_ssse3: 108.1 hflip_short_c: 300.1 hflip_short_ssse3: 139.8 (checkasm patch in attach if you want to test) Martin
On 12/3/2017 3:09 PM, Martin Vignali wrote: >> 2017-12-03 17:46 GMT+01:00 Paul B Mahol <onemda@gmail.com>: >> >>> On 12/3/17, Martin Vignali <martin.vignali@gmail.com> wrote: >>>> Hello, >>>> >>>> Maybe you can use a macro for byte and short version, >>>> only few lines are different in each version >>> >>> Sure, feel free to send patches. >>> >>> I'm not very macro proficient. >>> >> >> Ok, i will take a look. >> >> Martin >> > > I write a basic checkasm test. Seems like the byte version is slower than c > > hflip_byte_c: 31.8 > hflip_byte_ssse3: 108.1 > hflip_short_c: 300.1 > hflip_short_ssse3: 139.8 > > (checkasm patch in attach if you want to test) > > Martin $ tests/checkasm/checkasm.exe --test=vf_hflip --bench benchmarking with native FFmpeg timers nop: 32.0 hflip_byte_c: 362.0 hflip_byte_ssse3: 96.0 hflip_short_c: 374.0 hflip_short_ssse3: 121.0 Guess your compiler is really good at optimizing this code, or something funny is going on. Can you post a disassembly of hflip_byte_c?
> Can you post a disassembly of hflip_byte_c? > > > in O1 : clang -S -O1 test_asm_gen.c .section __TEXT,__text,regular,pure_instructions .macosx_version_min 10, 12 .globl _hflip_byte_c .p2align 4, 0x90 _hflip_byte_c: ## @hflip_byte_c .cfi_startproc ## BB#0: pushq %rbp Ltmp0: .cfi_def_cfa_offset 16 Ltmp1: .cfi_offset %rbp, -16 movq %rsp, %rbp Ltmp2: .cfi_def_cfa_register %rbp testl %edx, %edx jle LBB0_3 ## BB#1: movl %edx, %eax .p2align 4, 0x90 LBB0_2: ## =>This Inner Loop Header: Depth=1 movzbl (%rdi), %ecx movb %cl, (%rsi) decq %rdi incq %rsi decq %rax jne LBB0_2 LBB0_3: popq %rbp retq .cfi_endproc .subsections_via_symbols in O2 or O3 : clang -S -O3 test_asm_gen.c If i correctly understand, same idea than paul's patch but processing two xmm in the main loop .section __TEXT,__text,regular,pure_instructions .macosx_version_min 10, 12 .section __TEXT,__literal16,16byte_literals .p2align 4 LCPI0_0: .byte 15 ## 0xf .byte 14 ## 0xe .byte 13 ## 0xd .byte 12 ## 0xc .byte 11 ## 0xb .byte 10 ## 0xa .byte 9 ## 0x9 .byte 8 ## 0x8 .byte 7 ## 0x7 .byte 6 ## 0x6 .byte 5 ## 0x5 .byte 4 ## 0x4 .byte 3 ## 0x3 .byte 2 ## 0x2 .byte 1 ## 0x1 .byte 0 ## 0x0 .section __TEXT,__text,regular,pure_instructions .globl _hflip_byte_c .p2align 4, 0x90 _hflip_byte_c: ## @hflip_byte_c .cfi_startproc ## BB#0: pushq %rbp Ltmp0: .cfi_def_cfa_offset 16 Ltmp1: .cfi_offset %rbp, -16 movq %rsp, %rbp Ltmp2: .cfi_def_cfa_register %rbp ## kill: %EDX<def> %EDX<kill> %RDX<def> testl %edx, %edx jle LBB0_17 ## BB#1: movl %edx, %r8d cmpl $32, %edx jae LBB0_3 ## BB#2: xorl %r11d, %r11d jmp LBB0_11 LBB0_3: andl $31, %edx movq %r8, %r11 subq %rdx, %r11 je LBB0_7 ## BB#4: leaq 1(%rdi), %rax cmpq %rsi, %rax jbe LBB0_8 ## BB#5: leaq (%rsi,%r8), %r9 movl $1, %eax subq %r8, %rax addq %rdi, %rax cmpq %r9, %rax jae LBB0_8 LBB0_7: xorl %r11d, %r11d jmp LBB0_11 LBB0_8: leaq -15(%rdi), %r9 leaq 16(%rsi), %rax movdqa LCPI0_0(%rip), %xmm0 ## xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] movq %r11, %r10 .p2align 4, 0x90 
LBB0_9: ## =>This Inner Loop Header: Depth=1 movdqu -16(%r9), %xmm1 movdqu (%r9), %xmm2 pshufb %xmm0, %xmm2 pshufb %xmm0, %xmm1 movdqu %xmm2, -16(%rax) movdqu %xmm1, (%rax) addq $-32, %r9 addq $32, %rax addq $-32, %r10 jne LBB0_9 ## BB#10: testl %edx, %edx je LBB0_17 LBB0_11: movl %r8d, %eax subl %r11d, %eax leaq -1(%r8), %r9 subq %r11, %r9 andq $3, %rax je LBB0_14 ## BB#12: movq %rdi, %rdx subq %r11, %rdx negq %rax .p2align 4, 0x90 LBB0_13: ## =>This Inner Loop Header: Depth=1 movzbl (%rdx), %ecx movb %cl, (%rsi,%r11) incq %r11 decq %rdx incq %rax jne LBB0_13 LBB0_14: cmpq $3, %r9 jb LBB0_17 ## BB#15: subq %r11, %r8 subq %r11, %rdi leaq 3(%rsi,%r11), %rax .p2align 4, 0x90 LBB0_16: ## =>This Inner Loop Header: Depth=1 movzbl (%rdi), %ecx movb %cl, -3(%rax) movzbl -1(%rdi), %ecx movb %cl, -2(%rax) movzbl -2(%rdi), %ecx movb %cl, -1(%rax) movzbl -3(%rdi), %ecx movb %cl, (%rax) addq $-4, %rdi addq $4, %rax addq $-4, %r8 jne LBB0_16 LBB0_17: popq %rbp retq .cfi_endproc .subsections_via_symbols
On 12/3/2017 3:55 PM, Martin Vignali wrote: > in O2 or O3 : clang -S -O3 test_asm_gen.c > > If i correctly understand, same idea than paul's patch > but processing two xmm in the main loop > > .section __TEXT,__text,regular,pure_instructions > .macosx_version_min 10, 12 > .section __TEXT,__literal16,16byte_literals > .p2align 4 > LCPI0_0: > .byte 15 ## 0xf > .byte 14 ## 0xe > .byte 13 ## 0xd > .byte 12 ## 0xc > .byte 11 ## 0xb > .byte 10 ## 0xa > .byte 9 ## 0x9 > .byte 8 ## 0x8 > .byte 7 ## 0x7 > .byte 6 ## 0x6 > .byte 5 ## 0x5 > .byte 4 ## 0x4 > .byte 3 ## 0x3 > .byte 2 ## 0x2 > .byte 1 ## 0x1 > .byte 0 ## 0x0 > .section __TEXT,__text,regular,pure_instructions > .globl _hflip_byte_c > .p2align 4, 0x90 > _hflip_byte_c: ## @hflip_byte_c > .cfi_startproc > ## BB#0: > pushq %rbp > Ltmp0: > .cfi_def_cfa_offset 16 > Ltmp1: > .cfi_offset %rbp, -16 > movq %rsp, %rbp > Ltmp2: > .cfi_def_cfa_register %rbp > ## kill: %EDX<def> %EDX<kill> > %RDX<def> > testl %edx, %edx > jle LBB0_17 > ## BB#1: > movl %edx, %r8d > cmpl $32, %edx > jae LBB0_3 > ## BB#2: > xorl %r11d, %r11d > jmp LBB0_11 > LBB0_3: > andl $31, %edx > movq %r8, %r11 > subq %rdx, %r11 > je LBB0_7 > ## BB#4: > leaq 1(%rdi), %rax > cmpq %rsi, %rax > jbe LBB0_8 > ## BB#5: > leaq (%rsi,%r8), %r9 > movl $1, %eax > subq %r8, %rax > addq %rdi, %rax > cmpq %r9, %rax > jae LBB0_8 > LBB0_7: > xorl %r11d, %r11d > jmp LBB0_11 > LBB0_8: > leaq -15(%rdi), %r9 > leaq 16(%rsi), %rax > movdqa LCPI0_0(%rip), %xmm0 ## xmm0 = > [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] > movq %r11, %r10 > .p2align 4, 0x90 > LBB0_9: ## =>This Inner Loop Header: Depth=1 > movdqu -16(%r9), %xmm1 > movdqu (%r9), %xmm2 > pshufb %xmm0, %xmm2 > pshufb %xmm0, %xmm1 > movdqu %xmm2, -16(%rax) > movdqu %xmm1, (%rax) > addq $-32, %r9 > addq $32, %rax > addq $-32, %r10 > jne LBB0_9 Huh, so we're not disabling tree vectorization with clang, only with GCC. Guess it hasn't generated broken code before to justify disabling it. 
In any case, if clang or gcc can generate better code, then the hand-written version needs to be optimized to be as fast or faster.
On 12/3/17, Paul B Mahol <onemda@gmail.com> wrote: > Signed-off-by: Paul B Mahol <onemda@gmail.com> > --- > libavfilter/hflip.h | 38 ++++++++++++ > libavfilter/vf_hflip.c | 133 > ++++++++++++++++++++++++++-------------- > libavfilter/x86/Makefile | 2 + > libavfilter/x86/vf_hflip.asm | 98 +++++++++++++++++++++++++++++ > libavfilter/x86/vf_hflip_init.c | 41 +++++++++++++ > 5 files changed, 265 insertions(+), 47 deletions(-) > create mode 100644 libavfilter/hflip.h > create mode 100644 libavfilter/x86/vf_hflip.asm > create mode 100644 libavfilter/x86/vf_hflip_init.c > This is overall ~50% faster than pure C that gcc 6.3.0 gives with vanilla options.
On 12/3/17, Paul B Mahol <onemda@gmail.com> wrote: > On 12/3/17, Paul B Mahol <onemda@gmail.com> wrote: >> Signed-off-by: Paul B Mahol <onemda@gmail.com> >> --- >> libavfilter/hflip.h | 38 ++++++++++++ >> libavfilter/vf_hflip.c | 133 >> ++++++++++++++++++++++++++-------------- >> libavfilter/x86/Makefile | 2 + >> libavfilter/x86/vf_hflip.asm | 98 +++++++++++++++++++++++++++++ >> libavfilter/x86/vf_hflip_init.c | 41 +++++++++++++ >> 5 files changed, 265 insertions(+), 47 deletions(-) >> create mode 100644 libavfilter/hflip.h >> create mode 100644 libavfilter/x86/vf_hflip.asm >> create mode 100644 libavfilter/x86/vf_hflip_init.c >> > > This is overall ~50% faster than pure C that gcc 6.3.0 gives with > vanilla options. > By overall I mean this simple bench test: ffmpeg -f lavfi -i smptehdbars=hd1080 -vf hflip=threads=1 -f null -
> > In any case, if clang or gcc can generate better code, then the hand > written version needs to be optimized to be as fast or faster. > > > Quick test : pass checkasm (but probably only because width = 256) hflip_byte_c: 26.4 hflip_byte_ssse3: 20.4 INIT_XMM ssse3 cglobal hflip_byte, 3, 5, 2, src, dst, w, x, v, src2 mova m0, [pb_flip_byte] xor xq, xq ; <====== mov wd, dword wm sub wq, mmsize * 2 ;remove the cmp here <====== jl .skip .loop0: ; process two xmm in the loop neg xq movu m1, [srcq + xq - mmsize + 1] movu m2, [srcq + xq - mmsize * 2 + 1] <====== pshufb m1, m0 pshufb m2, m0 <====== neg xq movu [dstq + xq], m1 movu [dstq + xq + mmsize], m2 <====== add xq, mmsize * 2 <====== cmp xq, wq jl .loop0 RET ; add RET here ; MISSING one xmm process if need .skip: add wq, mmsize .loop1: neg xq mov vb, [srcq + xq] neg xq mov [dstq + xq], vb add xq, 1 cmp xq, wq jl .loop1 RET Martin
On 12/3/17, Martin Vignali <martin.vignali@gmail.com> wrote: >> >> In any case, if clang or gcc can generate better code, then the hand >> written version needs to be optimized to be as fast or faster. >> >> >> > Quick test : pass checkasm (but probably only because width = 256) > hflip_byte_c: 26.4 > hflip_byte_ssse3: 20.4 > > > INIT_XMM ssse3 > cglobal hflip_byte, 3, 5, 2, src, dst, w, x, v, src2 > mova m0, [pb_flip_byte] > xor xq, xq ; <====== > mov wd, dword wm > sub wq, mmsize * 2 > ;remove the cmp here <====== > jl .skip > > .loop0: ; process two xmm in the loop > neg xq > movu m1, [srcq + xq - mmsize + 1] > movu m2, [srcq + xq - mmsize * 2 + 1] <====== > pshufb m1, m0 > pshufb m2, m0 <====== > neg xq > movu [dstq + xq], m1 > movu [dstq + xq + mmsize], m2 <====== > add xq, mmsize * 2 <====== > cmp xq, wq > jl .loop0 > RET ; add RET here > > ; MISSING one xmm process if need > > .skip: > add wq, mmsize > .loop1: > neg xq > mov vb, [srcq + xq] > neg xq > mov [dstq + xq], vb > add xq, 1 > cmp xq, wq > jl .loop1 > RET So what is wrong now?
2017-12-03 20:36 GMT+01:00 Paul B Mahol <onemda@gmail.com>: > On 12/3/17, Martin Vignali <martin.vignali@gmail.com> wrote: > >> > >> In any case, if clang or gcc can generate better code, then the hand > >> written version needs to be optimized to be as fast or faster. > >> > >> > >> > > Quick test : pass checkasm (but probably only because width = 256) > > hflip_byte_c: 26.4 > > hflip_byte_ssse3: 20.4 > > > > > > INIT_XMM ssse3 > > cglobal hflip_byte, 3, 5, 2, src, dst, w, x, v, src2 > > mova m0, [pb_flip_byte] > > xor xq, xq ; <====== > > mov wd, dword wm > > sub wq, mmsize * 2 > > ;remove the cmp here <====== > > jl .skip > > > > .loop0: ; process two xmm in the loop > > neg xq > > movu m1, [srcq + xq - mmsize + 1] > > movu m2, [srcq + xq - mmsize * 2 + 1] <====== > > pshufb m1, m0 > > pshufb m2, m0 <====== > > neg xq > > movu [dstq + xq], m1 > > movu [dstq + xq + mmsize], m2 <====== > > add xq, mmsize * 2 <====== > > cmp xq, wq > > jl .loop0 > > RET ; add RET here > > > > ; MISSING one xmm process if need > > > > .skip: > > add wq, mmsize > > .loop1: > > neg xq > > mov vb, [srcq + xq] > > neg xq > > mov [dstq + xq], vb > > add xq, 1 > > cmp xq, wq > > jl .loop1 > > RET > > So what is wrong now? > Doesn't see your email, when i send mine. Check asm result with your last patch (and modify for the short version "add xq, mmsize" to "add xq, mmsize * 2") hflip_byte_c: 28.0 hflip_byte_ssse3: 127.5 hflip_short_c: 276.5 hflip_short_ssse3: 100.2 Do you think if you add RET after the end of loop0 , it can work in all cases ?
On 12/3/17, Martin Vignali <martin.vignali@gmail.com> wrote: > 2017-12-03 20:36 GMT+01:00 Paul B Mahol <onemda@gmail.com>: > >> On 12/3/17, Martin Vignali <martin.vignali@gmail.com> wrote: >> >> >> >> In any case, if clang or gcc can generate better code, then the hand >> >> written version needs to be optimized to be as fast or faster. >> >> >> >> >> >> >> > Quick test : pass checkasm (but probably only because width = 256) >> > hflip_byte_c: 26.4 >> > hflip_byte_ssse3: 20.4 >> > >> > >> > INIT_XMM ssse3 >> > cglobal hflip_byte, 3, 5, 2, src, dst, w, x, v, src2 >> > mova m0, [pb_flip_byte] >> > xor xq, xq ; <====== >> > mov wd, dword wm >> > sub wq, mmsize * 2 >> > ;remove the cmp here <====== >> > jl .skip >> > >> > .loop0: ; process two xmm in the loop >> > neg xq >> > movu m1, [srcq + xq - mmsize + 1] >> > movu m2, [srcq + xq - mmsize * 2 + 1] <====== >> > pshufb m1, m0 >> > pshufb m2, m0 <====== >> > neg xq >> > movu [dstq + xq], m1 >> > movu [dstq + xq + mmsize], m2 <====== >> > add xq, mmsize * 2 <====== >> > cmp xq, wq >> > jl .loop0 >> > RET ; add RET here >> > >> > ; MISSING one xmm process if need >> > >> > .skip: >> > add wq, mmsize >> > .loop1: >> > neg xq >> > mov vb, [srcq + xq] >> > neg xq >> > mov [dstq + xq], vb >> > add xq, 1 >> > cmp xq, wq >> > jl .loop1 >> > RET >> >> So what is wrong now? >> > > Doesn't see your email, when i send mine. > > Check asm result with your last patch (and modify for the short version > "add xq, mmsize" to "add xq, mmsize * 2") > hflip_byte_c: 28.0 > hflip_byte_ssse3: 127.5 > hflip_short_c: 276.5 > hflip_short_ssse3: 100.2 > Ops, fixed. > > Do you think if you add RET after the end of loop0 , it can work in all > cases ? No, it would try to read before src, and crash.
diff --git a/libavfilter/hflip.h b/libavfilter/hflip.h new file mode 100644 index 0000000000..138380427c --- /dev/null +++ b/libavfilter/hflip.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2007 Benoit Fouet + * Copyright (c) 2010 Stefano Sabatini + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVFILTER_HFLIP_H +#define AVFILTER_HFLIP_H + +#include "avfilter.h" + +typedef struct FlipContext { + const AVClass *class; + int max_step[4]; ///< max pixel step for each plane, expressed as a number of bytes + int planewidth[4]; ///< width of each plane + int planeheight[4]; ///< height of each plane + + void (*flip_line[4])(const uint8_t *src, uint8_t *dst, int w); +} FlipContext; + +void ff_hflip_init_x86(FlipContext *s, int step[4]); + +#endif /* AVFILTER_HFLIP_H */ diff --git a/libavfilter/vf_hflip.c b/libavfilter/vf_hflip.c index cf20c193f7..303cc8af60 100644 --- a/libavfilter/vf_hflip.c +++ b/libavfilter/vf_hflip.c @@ -29,6 +29,7 @@ #include "libavutil/opt.h" #include "avfilter.h" #include "formats.h" +#include "hflip.h" #include "internal.h" #include "video.h" #include "libavutil/pixdesc.h" @@ -36,13 +37,6 @@ #include "libavutil/intreadwrite.h" #include "libavutil/imgutils.h" -typedef struct FlipContext { - const AVClass *class; - int 
max_step[4]; ///< max pixel step for each plane, expressed as a number of bytes - int planewidth[4]; ///< width of each plane - int planeheight[4]; ///< height of each plane -} FlipContext; - static const AVOption hflip_options[] = { { NULL } }; @@ -67,12 +61,77 @@ static int query_formats(AVFilterContext *ctx) return ff_set_common_formats(ctx, pix_fmts); } +static void hflip_byte_c(const uint8_t *src, uint8_t *dst, int w) +{ + int j; + + for (j = 0; j < w; j++) + dst[j] = src[-j]; +} + +static void hflip_short_c(const uint8_t *ssrc, uint8_t *ddst, int w) +{ + const uint16_t *src = (const uint16_t *)ssrc; + uint16_t *dst = (uint16_t *)ddst; + int j; + + for (j = 0; j < w; j++) + dst[j] = src[-j]; +} + +static void hflip_dword_c(const uint8_t *ssrc, uint8_t *ddst, int w) +{ + const uint32_t *src = (const uint32_t *)ssrc; + uint32_t *dst = (uint32_t *)ddst; + int j; + + for (j = 0; j < w; j++) + dst[j] = src[-j]; +} + +static void hflip_b24_c(const uint8_t *src, uint8_t *dst, int w) +{ + const uint8_t *in = src; + uint8_t *out = dst; + int j; + + for (j = 0; j < w; j++, out += 3, in -= 3) { + int32_t v = AV_RB24(in); + + AV_WB24(out, v); + } +} + +static void hflip_b48_c(const uint8_t *src, uint8_t *dst, int w) +{ + const uint8_t *in = src; + uint8_t *out = dst; + int j; + + for (j = 0; j < w; j++, out += 6, in -= 6) { + int64_t v = AV_RB48(in); + + AV_WB48(out, v); + } +} + +static void hflip_qword_c(const uint8_t *ssrc, uint8_t *ddst, int w) +{ + const uint64_t *src = (const uint64_t *)ssrc; + uint64_t *dst = (uint64_t *)ddst; + int j; + + for (j = 0; j < w; j++) + dst[j] = src[-j]; +} + static int config_props(AVFilterLink *inlink) { FlipContext *s = inlink->dst->priv; const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(inlink->format); const int hsub = pix_desc->log2_chroma_w; const int vsub = pix_desc->log2_chroma_h; + int i; av_image_fill_max_pixsteps(s->max_step, NULL, pix_desc); s->planewidth[0] = s->planewidth[3] = inlink->w; @@ -80,6 +139,22 @@ static 
int config_props(AVFilterLink *inlink) s->planeheight[0] = s->planeheight[3] = inlink->h; s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, vsub); + for (i = 0; i < 4; i++) { + switch (s->max_step[i]) { + case 1: s->flip_line[i] = hflip_byte_c; break; + case 2: s->flip_line[i] = hflip_short_c; break; + case 3: s->flip_line[i] = hflip_b24_c; break; + case 4: s->flip_line[i] = hflip_dword_c; break; + case 6: s->flip_line[i] = hflip_b48_c; break; + case 8: s->flip_line[i] = hflip_qword_c; break; + default: + return AVERROR_BUG; + } + } + + if (ARCH_X86) + ff_hflip_init_x86(s, s->max_step); + return 0; } @@ -94,7 +169,7 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int job, int nb_jobs) AVFrame *in = td->in; AVFrame *out = td->out; uint8_t *inrow, *outrow; - int i, j, plane, step; + int i, plane, step; for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++) { const int width = s->planewidth[plane]; @@ -107,45 +182,7 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int job, int nb_jobs) outrow = out->data[plane] + start * out->linesize[plane]; inrow = in ->data[plane] + start * in->linesize[plane] + (width - 1) * step; for (i = start; i < end; i++) { - switch (step) { - case 1: - for (j = 0; j < width; j++) - outrow[j] = inrow[-j]; - break; - - case 2: - { - uint16_t *outrow16 = (uint16_t *)outrow; - uint16_t * inrow16 = (uint16_t *) inrow; - for (j = 0; j < width; j++) - outrow16[j] = inrow16[-j]; - } - break; - - case 3: - { - uint8_t *in = inrow; - uint8_t *out = outrow; - for (j = 0; j < width; j++, out += 3, in -= 3) { - int32_t v = AV_RB24(in); - AV_WB24(out, v); - } - } - break; - - case 4: - { - uint32_t *outrow32 = (uint32_t *)outrow; - uint32_t * inrow32 = (uint32_t *) inrow; - for (j = 0; j < width; j++) - outrow32[j] = inrow32[-j]; - } - break; - - default: - for (j = 0; j < width; j++) - memcpy(outrow + j*step, inrow - j*step, step); - } + s->flip_line[plane](inrow, outrow, width); inrow += in 
->linesize[plane]; outrow += out->linesize[plane]; diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index 3431625883..1420954f62 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -5,6 +5,7 @@ OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq.o OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun_init.o +OBJS-$(CONFIG_HFLIP_FILTER) += x86/vf_hflip_init.o OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace_init.o @@ -31,6 +32,7 @@ X86ASM-OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif.o X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp.o X86ASM-OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o X86ASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o +X86ASM-OBJS-$(CONFIG_HFLIP_FILTER) += x86/vf_hflip.o X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o X86ASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o diff --git a/libavfilter/x86/vf_hflip.asm b/libavfilter/x86/vf_hflip.asm new file mode 100644 index 0000000000..2fea31db61 --- /dev/null +++ b/libavfilter/x86/vf_hflip.asm @@ -0,0 +1,92 @@ +;***************************************************************************** +;* x86-optimized functions for hflip filter +;* +;* Copyright (C) 2017 Paul B Mahol +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;***************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pb_flip_byte: times 16 db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +pb_flip_short: times 16 db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1 + +SECTION .text + +INIT_XMM ssse3 +cglobal hflip_byte, 3, 5, 2, src, dst, w, x, v + mova m0, [pb_flip_byte] + mov xq, 0 + mov wd, dword wm + sub wq, mmsize + cmp wq, mmsize + jl .skip + + .loop0: + neg xq + movu m1, [srcq + xq - mmsize + 1] + pshufb m1, m0 + neg xq + movu [dstq + xq], m1 + add xq, mmsize + cmp xq, wq + jl .loop0 + +.skip: + add wq, mmsize + .loop1: + neg xq + mov vb, [srcq + xq] + neg xq + mov [dstq + xq], vb + add xq, 1 + cmp xq, wq + jl .loop1 +RET + +cglobal hflip_short, 3, 5, 2, src, dst, w, x, v + mova m0, [pb_flip_short] + mov xq, 0 + mov wd, dword wm + add wq, wq + sub wq, mmsize + cmp wq, mmsize + jl .skip + + .loop0: + neg xq + movu m1, [srcq + xq - mmsize + 2] + pshufb m1, m0 + neg xq + movu [dstq + xq], m1 + add xq, mmsize + cmp xq, wq + jl .loop0 + +.skip: + add wq, mmsize + .loop1: + neg xq + mov vw, [srcq + xq] + neg xq + mov [dstq + xq], vw + add xq, 2 + cmp xq, wq + jl .loop1 +RET diff --git a/libavfilter/x86/vf_hflip_init.c b/libavfilter/x86/vf_hflip_init.c new file mode 100644 index 0000000000..d8eab1f905 --- /dev/null +++ b/libavfilter/x86/vf_hflip_init.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2017 Paul B Mahol + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavfilter/hflip.h" + +void ff_hflip_byte_ssse3(const uint8_t *src, uint8_t *dst, int w); +void ff_hflip_short_ssse3(const uint8_t *src, uint8_t *dst, int w); + +av_cold void ff_hflip_init_x86(FlipContext *s, int step[4]) +{ + int cpu_flags = av_get_cpu_flags(); + int i; + + for (i = 0; i < 4; i++) { + if (EXTERNAL_SSSE3(cpu_flags) && step[i] == 1) { + s->flip_line[i] = ff_hflip_byte_ssse3; + } else if (EXTERNAL_SSSE3(cpu_flags) && step[i] == 2) { + s->flip_line[i] = ff_hflip_short_ssse3; + } + } +}
Signed-off-by: Paul B Mahol <onemda@gmail.com> --- libavfilter/hflip.h | 38 ++++++++++++ libavfilter/vf_hflip.c | 131 ++++++++++++++++++++++++++-------------- libavfilter/x86/Makefile | 2 + libavfilter/x86/vf_hflip.asm | 92 ++++++++++++++++++++++++++++ libavfilter/x86/vf_hflip_init.c | 41 +++++++++++++ 5 files changed, 257 insertions(+), 47 deletions(-) create mode 100644 libavfilter/hflip.h create mode 100644 libavfilter/x86/vf_hflip.asm create mode 100644 libavfilter/x86/vf_hflip_init.c