diff mbox

[FFmpeg-devel,4/4] avfilter/vf_framerate: add SIMD functions for frame blending

Message ID 20180118000332.31119-4-cus@passwd.hu
State Superseded
Headers show

Commit Message

Marton Balint Jan. 18, 2018, 12:03 a.m. UTC
Blend function speedups on x86_64 Core i5 4460:

ffmpeg -f lavfi -i allyuv -vf framerate=60:threads=1 -f null none

C:     447548411 decicycles in Blend,    2048 runs,      0 skips
SSSE3: 130020087 decicycles in Blend,    2048 runs,      0 skips
AVX2:  128508221 decicycles in Blend,    2048 runs,      0 skips

ffmpeg -f lavfi -i allyuv -vf format=yuv420p12,framerate=60:threads=1 -f null none

C:     228932745 decicycles in Blend,    2048 runs,      0 skips
SSE4:  123357781 decicycles in Blend,    2048 runs,      0 skips
AVX2:  121215353 decicycles in Blend,    2048 runs,      0 skips

Signed-off-by: Marton Balint <cus@passwd.hu>
---
 libavfilter/vf_framerate.c       |  24 ++++++-
 libavfilter/x86/Makefile         |   1 +
 libavfilter/x86/vf_framerate.asm | 131 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 153 insertions(+), 3 deletions(-)
 create mode 100644 libavfilter/x86/vf_framerate.asm

Comments

Carl Eugen Hoyos Jan. 18, 2018, 9:30 a.m. UTC | #1
2018-01-18 1:03 GMT+01:00 Marton Balint <cus@passwd.hu>:
> Blend function speedups on x86_64 Core i5 4460:
>
> ffmpeg -f lavfi -i allyuv -vf framerate=60:threads=1 -f null none
>
> C:     447548411 decicycles in Blend,    2048 runs,      0 skips
> SSSE3: 130020087 decicycles in Blend,    2048 runs,      0 skips
> AVX2:  128508221 decicycles in Blend,    2048 runs,      0 skips
>
> ffmpeg -f lavfi -i allyuv -vf format=yuv420p12,framerate=60:threads=1 -f null none
>
> C:     228932745 decicycles in Blend,    2048 runs,      0 skips
> SSE4:  123357781 decicycles in Blend,    2048 runs,      0 skips
> AVX2:  121215353 decicycles in Blend,    2048 runs,      0 skips

Is the avx2 version really useful?

Carl Eugen
Martin Vignali Jan. 18, 2018, 11:39 a.m. UTC | #2
>      if (s->bitdepth == 8) {
>          s->blend_factor_max = 1 << BLEND_FACTOR_DEPTH8;
> -        s->blend = blend_frames_c;
> +        if (ARCH_X86 && EXTERNAL_AVX2(cpu_flags))
> +            s->blend = ff_blend_frames_avx2;
>

I think it should be:
if (EXTERNAL_AVX2_FAST(cpu_flags))


> +        else if (ARCH_X86 && EXTERNAL_SSSE3(cpu_flags))
> +            s->blend = ff_blend_frames_ssse3;
> +        else
> +            s->blend = blend_frames_c;
>      } else {
>          s->blend_factor_max = 1 << BLEND_FACTOR_DEPTH16;
> -        s->blend = blend_frames16_c;
> +        if (ARCH_X86 && EXTERNAL_AVX2(cpu_flags))
> +            s->blend = ff_blend_frames16_avx2;
>

same here

+        else if (ARCH_X86 && EXTERNAL_SSE4(cpu_flags))
> +            s->blend = ff_blend_frames16_sse4;
> +        else
> +            s->blend = blend_frames16_c;
>      }
>
>
+
> +
> +INIT_XMM ssse3
> +BLEND_FRAMES
> +
> +INIT_YMM avx2
> +BLEND_FRAMES
>

This probably needs
%if HAVE_AVX2_EXTERNAL

%endif (around the INIT_YMM avx2 blocks)


> +
> +INIT_XMM sse4
> +BLEND_FRAMES16
> +
> +INIT_YMM avx2
> +BLEND_FRAMES16
> --
>
>
Martin
Marton Balint Jan. 18, 2018, 8:40 p.m. UTC | #3
On Thu, 18 Jan 2018, Carl Eugen Hoyos wrote:

> 2018-01-18 1:03 GMT+01:00 Marton Balint <cus@passwd.hu>:
>> Blend function speedups on x86_64 Core i5 4460:
>>
>> ffmpeg -f lavfi -i allyuv -vf framerate=60:threads=1 -f null none
>>
>> C:     447548411 decicycles in Blend,    2048 runs,      0 skips
>> SSSE3: 130020087 decicycles in Blend,    2048 runs,      0 skips
>> AVX2:  128508221 decicycles in Blend,    2048 runs,      0 skips
>>
>> ffmpeg -f lavfi -i allyuv -vf format=yuv420p12,framerate=60:threads=1 -f null none
>>
>> C:     228932745 decicycles in Blend,    2048 runs,      0 skips
>> SSE4:  123357781 decicycles in Blend,    2048 runs,      0 skips
>> AVX2:  121215353 decicycles in Blend,    2048 runs,      0 skips
>
> Is the avx2 version really useful?

Well, it _is_ faster, even if the speedup is not significant. The ASM code
is almost the same, so almost no maintenance burden is involved in keeping
both.

Regards,
Marton
diff mbox

Patch

diff --git a/libavfilter/vf_framerate.c b/libavfilter/vf_framerate.c
index d315ef5d09..1c10699c3a 100644
--- a/libavfilter/vf_framerate.c
+++ b/libavfilter/vf_framerate.c
@@ -29,11 +29,13 @@ 
 #define DEBUG
 
 #include "libavutil/avassert.h"
+#include "libavutil/cpu.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/internal.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
 #include "libavutil/pixelutils.h"
+#include "libavutil/x86/cpu.h"
 
 #include "avfilter.h"
 #include "internal.h"
@@ -246,7 +248,7 @@  static int blend_frames(AVFilterContext *ctx, int interpolate)
         av_frame_copy_props(s->work, s->f0);
 
         ff_dlog(ctx, "blend_frames() INTERPOLATE to create work frame\n");
-        ctx->internal->execute(ctx, filter_slice, &td, NULL, FFMIN(outlink->h, ff_filter_get_nb_threads(ctx)));
+        ctx->internal->execute(ctx, filter_slice, &td, NULL, FFMIN(FFMAX(1, outlink->h >> 2), ff_filter_get_nb_threads(ctx)));
         return 1;
     }
     return 0;
@@ -347,6 +349,11 @@  static void blend_frames_c(BLEND_FUNC_PARAMS)
     }
 }
 
+void ff_blend_frames_ssse3(BLEND_FUNC_PARAMS);
+void ff_blend_frames_avx2(BLEND_FUNC_PARAMS);
+void ff_blend_frames16_sse4(BLEND_FUNC_PARAMS);
+void ff_blend_frames16_avx2(BLEND_FUNC_PARAMS);
+
 static void blend_frames16_c(BLEND_FUNC_PARAMS)
 {
     int line, pixel;
@@ -371,6 +378,7 @@  static int config_input(AVFilterLink *inlink)
     AVFilterContext *ctx = inlink->dst;
     FrameRateContext *s = ctx->priv;
     const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(inlink->format);
+    int cpu_flags = av_get_cpu_flags();
     int plane;
 
     for (plane = 0; plane < 4; plane++) {
@@ -389,10 +397,20 @@  static int config_input(AVFilterLink *inlink)
 
     if (s->bitdepth == 8) {
         s->blend_factor_max = 1 << BLEND_FACTOR_DEPTH8;
-        s->blend = blend_frames_c;
+        if (ARCH_X86 && EXTERNAL_AVX2_FAST(cpu_flags)) /* _FAST: also skip CPUs flagged as slow with AVX */
+            s->blend = ff_blend_frames_avx2;
+        else if (ARCH_X86 && EXTERNAL_SSSE3(cpu_flags))
+            s->blend = ff_blend_frames_ssse3;
+        else
+            s->blend = blend_frames_c; /* portable fallback */
     } else {
         s->blend_factor_max = 1 << BLEND_FACTOR_DEPTH16;
-        s->blend = blend_frames16_c;
+        if (ARCH_X86 && EXTERNAL_AVX2_FAST(cpu_flags)) /* same AVX2 gating as the 8-bit path */
+            s->blend = ff_blend_frames16_avx2;
+        else if (ARCH_X86 && EXTERNAL_SSE4(cpu_flags))
+            s->blend = ff_blend_frames16_sse4;
+        else
+            s->blend = blend_frames16_c; /* portable fallback */
     }
 
     return 0;
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 2fc5c62644..07fc0f9486 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -31,6 +31,7 @@  X86ASM-OBJS-$(CONFIG_AFIR_FILTER)            += x86/af_afir.o
 X86ASM-OBJS-$(CONFIG_BLEND_FILTER)           += x86/vf_blend.o
 X86ASM-OBJS-$(CONFIG_BWDIF_FILTER)           += x86/vf_bwdif.o
 X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER)      += x86/colorspacedsp.o
+X86ASM-OBJS-$(CONFIG_FRAMERATE_FILTER)       += x86/vf_framerate.o
 X86ASM-OBJS-$(CONFIG_FSPP_FILTER)            += x86/vf_fspp.o
 X86ASM-OBJS-$(CONFIG_GRADFUN_FILTER)         += x86/vf_gradfun.o
 X86ASM-OBJS-$(CONFIG_HFLIP_FILTER)           += x86/vf_hflip.o
diff --git a/libavfilter/x86/vf_framerate.asm b/libavfilter/x86/vf_framerate.asm
new file mode 100644
index 0000000000..0b1bed821f
--- /dev/null
+++ b/libavfilter/x86/vf_framerate.asm
@@ -0,0 +1,131 @@ 
+;*****************************************************************************
+;* x86-optimized functions for framerate filter
+;*
+;* Copyright (C) 2018 Marton Balint
+;*
+;* Based on vf_blend.asm, Copyright (C) 2015 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+
+; XSPLAT dst, src, size
+;   %1 = destination vector register
+;   %2 = source operand holding the scalar (callers in this file pass rNm
+;        argument operands)
+;   %3 = element size suffix: w or d
+; Fills every %3-sized element of %1 with the scalar read from %2.
+%macro XSPLAT 3
+%if cpuflag(avx2)
+    vpbroadcast%3  %1, %2                   ; single-instruction broadcast when AVX2 is targeted
+%else
+    movd           %1, %2                   ; scalar into the low dword, then replicate below
+%ifidn %3, d
+    SPLATD         %1                       ; x86util helper, presumably dword replicate
+%else
+    SPLATW         %1, %1                   ; x86util helper, presumably word replicate
+%endif
+%endif
+%endmacro
+
+
+; BLEND_INIT [suffix]
+; Declares blend_frames<suffix> through cglobal and prepares the registers
+; that BLEND_LOOP relies on:
+;   - endd holds the 8th argument (r7m), which BLEND_LOOP counts down per row
+;   - src1q, src2q and dstq are advanced by width
+;   - widthq is negated, so the loop index can run from -width up to zero
+; On x86_64 the first six arguments are requested in registers and width is
+; fetched by hand. On x86_32 only five are requested, and dst_linesizeq and
+; widthq are aliased to the argument operands r5mp and r6mp instead.
+%macro BLEND_INIT 0-1
+%if ARCH_X86_64
+cglobal blend_frames%1, 6, 9, 5, src1, src1_linesize, src2, src2_linesize, dst, dst_linesize, width, end, x
+    mov    widthd, dword widthm             ; 7th argument, not among the six loaded by cglobal
+%else
+cglobal blend_frames%1, 5, 7, 5, src1, src1_linesize, src2, src2_linesize, dst, end, x
+%define dst_linesizeq r5mp
+%define widthq r6mp
+%endif
+    mov      endd, dword r7m                ; row countdown consumed by BLEND_LOOP
+    add     src1q, widthq                   ; point each buffer at the end of its first row
+    add     src2q, widthq
+    add      dstq, widthq
+    neg    widthq                           ; on x86_32 widthq is r6mp, so the argument operand itself is negated
+%endmacro
+
+
+; BLEND_LOOP in, wide, madd, shift
+;   %1 = input element size suffix (b or w in this file)
+;   %2 = intermediate element size suffix (w or d in this file)
+;   %3 = pmadd variant suffix (ubsw or wd in this file)
+;   %4 = right-shift count applied after the rounding term is added
+; Expects the register setup from BLEND_INIT, m2 = interleaved factor pairs
+; and m3 = rounding term. Uses m0 and m1 for data and passes m4 to SBUTTERFLY
+; (presumably as its scratch register).
+; Per element: dst = (src1 * factor1 + src2 * factor2 + half) >> %4.
+; NOTE(review): pmaddubsw treats its second operand (the factors) as signed
+; bytes and pmaddwd treats both operands as signed words - confirm that the
+; factor and pixel ranges supplied by the caller stay within those limits.
+; NOTE(review): every iteration loads and stores a full mmsize-byte vector, so
+; a width that is not a multiple of mmsize touches bytes past it - confirm
+; that the buffers are padded accordingly.
+%macro BLEND_LOOP 4
+.nextrow:
+    mov        xq, widthq                   ; x = -width (negated in BLEND_INIT)
+
+    .loop:
+        movu            m0, [src1q + xq]    ; unaligned loads: pointers sit at row end, x is negative
+        movu            m1, [src2q + xq]
+        SBUTTERFLY    %1%2, 0, 1, 4         ; aAbBcCdD
+                                            ; eEfFgGhH
+        pmadd%3         m0, m2              ; multiply each src1/src2 pair by its factor pair and sum
+        pmadd%3         m1, m2
+
+        padd%2          m0, m3              ; add the rounding term before shifting
+        padd%2          m1, m3
+        psrl%2          m0, %4              ; 0A0B0C0D
+        psrl%2          m1, %4              ; 0E0F0G0H
+
+        packus%2%1      m0, m1              ; ABCDEFGH
+        movu   [dstq + xq], m0
+        add             xq, mmsize          ; advance one vector; flags feed the jl below
+    jl .loop
+    add     src1q, src1_linesizeq           ; step all three buffers to the next row
+    add     src2q, src2_linesizeq
+    add      dstq, dst_linesizeq
+    sub      endd, 1                        ; one row done
+    jg .nextrow
+REP_RET
+%endmacro
+
+
+; BLEND_FRAMES
+; Emits blend_frames for the currently selected instruction set: byte input,
+; word intermediates, pmaddubsw and a final shift by 7.
+; Register setup handed to BLEND_LOOP:
+;   m2 = every word holds factor1 in its low byte and factor2 in its high byte
+;   m3 = every word holds half (the rounding term)
+; NOTE(review): the por merge assumes factor1 fits in the low byte - confirm
+; against the caller.
+%macro BLEND_FRAMES 0
+    BLEND_INIT
+
+    XSPLAT     m2, r8m, w                   ; factor1
+    XSPLAT     m3, r9m, w                   ; factor2
+
+    psllw      m3, 8                        ; move factor2 into the high byte of each word
+    por        m2, m3                       ; interleaved factors
+
+    XSPLAT     m3, r10m, w                  ; half
+
+    BLEND_LOOP  b, w, ubsw, 7
+%endmacro
+
+
+; BLEND_FRAMES16
+; Emits blend_frames16 for the currently selected instruction set: word
+; input, dword intermediates, pmaddwd and a final shift by 15.
+; Register setup handed to BLEND_LOOP:
+;   m2 = every dword holds factor1 in its low word and factor2 in its high word
+;   m3 = every dword holds half (the rounding term)
+; NOTE(review): the por merge assumes factor1 fits in the low word - confirm
+; against the caller.
+%macro BLEND_FRAMES16 0
+    BLEND_INIT 16
+
+    XSPLAT     m2, r8m, d                   ; factor1
+    XSPLAT     m3, r9m, d                   ; factor2
+
+    pslld      m3, 16                       ; move factor2 into the high word of each dword
+    por        m2, m3                       ; interleaved factors
+
+    XSPLAT     m3, r10m, d                  ; half
+
+    BLEND_LOOP  w, d, wd, 15
+%endmacro
+
+
+; Instantiate the blend functions once per targeted instruction set. Each
+; INIT_* line selects the register width and cpu suffix used by the macro
+; expansion that follows it.
+INIT_XMM ssse3
+BLEND_FRAMES
+
+INIT_XMM sse4
+BLEND_FRAMES16
+
+; Only assemble the AVX2 variants when the build enables AVX2 in external
+; assembly; without this guard the file fails to assemble on toolchains
+; that cannot encode AVX2 instructions.
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+BLEND_FRAMES
+
+INIT_YMM avx2
+BLEND_FRAMES16
+%endif