diff mbox

[FFmpeg-devel,3/3] avfilter/vf_framerate: add SIMD functions for frame blending

Message ID 20180113215739.29913-3-cus@passwd.hu
State Superseded
Headers show

Commit Message

Marton Balint Jan. 13, 2018, 9:57 p.m. UTC
Signed-off-by: Marton Balint <cus@passwd.hu>
---
 libavfilter/vf_framerate.c       |  29 ++++++--
 libavfilter/x86/Makefile         |   1 +
 libavfilter/x86/vf_framerate.asm | 141 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 166 insertions(+), 5 deletions(-)
 create mode 100644 libavfilter/x86/vf_framerate.asm

Comments

Carl Eugen Hoyos Jan. 14, 2018, 3:12 a.m. UTC | #1
2018-01-13 22:57 GMT+01:00 Marton Balint <cus@passwd.hu>:
> Signed-off-by: Marton Balint <cus@passwd.hu>
> ---
>  libavfilter/vf_framerate.c       |  29 ++++++--
>  libavfilter/x86/Makefile         |   1 +
>  libavfilter/x86/vf_framerate.asm | 141 +++++++++++++++++++++++++++++++++++++++
>  3 files changed, 166 insertions(+), 5 deletions(-)
>  create mode 100644 libavfilter/x86/vf_framerate.asm

I believe you should add some numbers about the speedup.
(I prefer overall numbers, but usually per-function numbers are shown.)

Thank you, Carl Eugen
Henrik Gramner Jan. 14, 2018, 10:31 a.m. UTC | #2
On Sat, Jan 13, 2018 at 10:57 PM, Marton Balint <cus@passwd.hu> wrote:
> +    .loop:
> +        movu            m0, [src1q + xq]
> +        movu            m1, [src2q + xq]
> +        punpckl%1%2     m5, m0, m2         ; 0e0f0g0h
> +        punpckh%1%2     m0, m2             ; 0a0b0c0d
> +        punpckl%1%2     m6, m1, m2         ; 0E0F0G0H
> +        punpckh%1%2     m1, m2             ; 0A0B0C0D
> +        pmull%2         m0, m3
> +        pmull%2         m5, m3
> +        pmull%2         m1, m4
> +        pmull%2         m6, m4
> +        padd%2          m0, m7
> +        padd%2          m5, m7
> +        padd%2          m0, m1
> +        padd%2          m5, m6

pmaddubsw should work here for the 8-bit case. pmaddwd might work for
the 16-bit case depending on how many bits are actually used.

> +    pinsrw    xm3, r8m, 0                   ; factor1
> +    pinsrw    xm4, r9m, 0                   ; factor2
> +    pinsrw    xm7, r10m, 0                  ; half
> +    SPLATW     m3, xm3
> +    SPLATW     m4, xm4
> +    SPLATW     m7, xm7

vpbroadcast* from memory on avx2, otherwise movd instead of pxor+pinsrw.

> +    pxor       m3, m3
> +    pxor       m4, m4
> +    pxor       m7, m7
> +    pinsrw    xm3, r8m, 0                   ; factor1
> +    pinsrw    xm4, r9m, 0                   ; factor2
> +    pinsrw    xm7, r10m, 0                  ; half
> +    XSPLATD       3
> +    XSPLATD       4
> +    XSPLATD       7

Ditto.

> +    neg word r11m                           ; shift = -shift
> +    add word r11m, 16                       ; shift += 16
> +    pxor       m2, m2
> +    pinsrw    xm2, r11m, 0                  ; 16 - shift
> +    pslld      m3, xm2
> +    pslld      m4, xm2
> +    pslld      m7, xm2

You probably want to use a temporary register instead of doing slow
load-modify-store instructions.

Doing this in SIMD might be an option as well, e.g. load data directly
into vector regs from the stack, shift, then splat.
Marton Balint Jan. 14, 2018, 11:09 p.m. UTC | #3
On Sun, 14 Jan 2018, Henrik Gramner wrote:

> On Sat, Jan 13, 2018 at 10:57 PM, Marton Balint <cus@passwd.hu> wrote:
>> +    .loop:
>> +        movu            m0, [src1q + xq]
>> +        movu            m1, [src2q + xq]
>> +        punpckl%1%2     m5, m0, m2         ; 0e0f0g0h
>> +        punpckh%1%2     m0, m2             ; 0a0b0c0d
>> +        punpckl%1%2     m6, m1, m2         ; 0E0F0G0H
>> +        punpckh%1%2     m1, m2             ; 0A0B0C0D
>> +        pmull%2         m0, m3
>> +        pmull%2         m5, m3
>> +        pmull%2         m1, m4
>> +        pmull%2         m6, m4
>> +        padd%2          m0, m7
>> +        padd%2          m5, m7
>> +        padd%2          m0, m1
>> +        padd%2          m5, m6
>
> pmaddubsw should work here for the 8-bit case. pmaddwd might work for
> the 16-bit case depending on how many bits are actually used.
>

As far as I see, I have to make the blending factors 7-bit (15-bit) in
order for this to work, because the pmadd* instructions operate on signed
integers. Losing 1 bit of precision in the blending factors is
probably not a problem for the framerate filter.

So my loop would look like this:

     .loop:
         movu            m0, [src1q + xq]
         movu            m1, [src2q + xq]
         SBUTTERFLY     %1%2, 0, 1, 5        ; aAbBcCdD
                                             ; eEfFgGhH
         pmadd%3         m0, m3
         pmadd%3         m1, m3

         padd%2          m0, m7
         padd%2          m1, m7
         psrl%2          m0, %4              ; 0A0B0C0D
         psrl%2          m1, %4              ; 0E0F0G0H

         packus%2%1      m0, m1              ; ABCDEFGH
         movu   [dstq + xq], m0
         add             xq, mmsize
     jl .loop

Is this what you had in mind?

>> +    pinsrw    xm3, r8m, 0                   ; factor1
>> +    pinsrw    xm4, r9m, 0                   ; factor2
>> +    pinsrw    xm7, r10m, 0                  ; half
>> +    SPLATW     m3, xm3
>> +    SPLATW     m4, xm4
>> +    SPLATW     m7, xm7
>
> vpbroadcast* from memory on avx2, otherwise movd instead of pxor+pinsrw.
>
>> +    pxor       m3, m3
>> +    pxor       m4, m4
>> +    pxor       m7, m7
>> +    pinsrw    xm3, r8m, 0                   ; factor1
>> +    pinsrw    xm4, r9m, 0                   ; factor2
>> +    pinsrw    xm7, r10m, 0                  ; half
>> +    XSPLATD       3
>> +    XSPLATD       4
>> +    XSPLATD       7
>
> Ditto.
>
>> +    neg word r11m                           ; shift = -shift
>> +    add word r11m, 16                       ; shift += 16
>> +    pxor       m2, m2
>> +    pinsrw    xm2, r11m, 0                  ; 16 - shift
>> +    pslld      m3, xm2
>> +    pslld      m4, xm2
>> +    pslld      m7, xm2
>
> You probably want to use a temporary register instead of doing slow
> load-modify-store instructions.

Ok, I will rework these, although these parts are only in the initialization
code, so I guess they are not performance-critical.

Thanks,
Marton
diff mbox

Patch

diff --git a/libavfilter/vf_framerate.c b/libavfilter/vf_framerate.c
index 758fa56339..6d946bda67 100644
--- a/libavfilter/vf_framerate.c
+++ b/libavfilter/vf_framerate.c
@@ -29,11 +29,13 @@ 
 #define DEBUG
 
 #include "libavutil/avassert.h"
+#include "libavutil/cpu.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/internal.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
 #include "libavutil/pixelutils.h"
+#include "libavutil/x86/cpu.h"
 
 #include "avfilter.h"
 #include "internal.h"
@@ -246,7 +248,7 @@  static int blend_frames(AVFilterContext *ctx, int interpolate)
         av_frame_copy_props(s->work, s->f0);
 
         ff_dlog(ctx, "blend_frames() INTERPOLATE to create work frame\n");
-        ctx->internal->execute(ctx, filter_slice, &td, NULL, FFMIN(outlink->h, ff_filter_get_nb_threads(ctx)));
+        ctx->internal->execute(ctx, filter_slice, &td, NULL, FFMIN(FFMAX(1, outlink->h >> 2), ff_filter_get_nb_threads(ctx)));
         return 1;
     }
     return 0;
@@ -350,6 +352,11 @@  static void blend_frames_c(BLEND_FUNC_PARAMS)
     }
 }
 
+void ff_blend_frames_sse2(BLEND_FUNC_PARAMS);
+void ff_blend_frames_avx2(BLEND_FUNC_PARAMS);
+void ff_blend_frames16_sse4(BLEND_FUNC_PARAMS);
+void ff_blend_frames16_avx2(BLEND_FUNC_PARAMS);
+
 static void blend_frames16_c(BLEND_FUNC_PARAMS)
 {
     int line, pixel;
@@ -374,6 +381,7 @@  static int config_input(AVFilterLink *inlink)
     AVFilterContext *ctx = inlink->dst;
     FrameRateContext *s = ctx->priv;
     const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(inlink->format);
+    int cpu_flags = av_get_cpu_flags();
     int plane;
 
     for (plane = 0; plane < 4; plane++) {
@@ -393,10 +401,21 @@  static int config_input(AVFilterLink *inlink)
     s->srce_time_base = inlink->time_base;
 
     s->max = 1 << (s->bitdepth);
-    if (s->bitdepth == 8)
-        s->blend = blend_frames_c;
-    else
-        s->blend = blend_frames16_c;
+    if (s->bitdepth == 8) {
+        if (ARCH_X86 && EXTERNAL_AVX2(cpu_flags))
+            s->blend = ff_blend_frames_avx2;
+        else if (ARCH_X86 && EXTERNAL_SSE2(cpu_flags))
+            s->blend = ff_blend_frames_sse2;
+        else
+            s->blend = blend_frames_c;
+    } else {
+        if (ARCH_X86 && EXTERNAL_AVX2(cpu_flags))
+            s->blend = ff_blend_frames16_avx2;
+        else if (ARCH_X86 && EXTERNAL_SSE4(cpu_flags))
+            s->blend = ff_blend_frames16_sse4;
+        else
+            s->blend = blend_frames16_c;
+    }
 
     return 0;
 }
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 2fc5c62644..07fc0f9486 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -31,6 +31,7 @@  X86ASM-OBJS-$(CONFIG_AFIR_FILTER)            += x86/af_afir.o
 X86ASM-OBJS-$(CONFIG_BLEND_FILTER)           += x86/vf_blend.o
 X86ASM-OBJS-$(CONFIG_BWDIF_FILTER)           += x86/vf_bwdif.o
 X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER)      += x86/colorspacedsp.o
+X86ASM-OBJS-$(CONFIG_FRAMERATE_FILTER)       += x86/vf_framerate.o
 X86ASM-OBJS-$(CONFIG_FSPP_FILTER)            += x86/vf_fspp.o
 X86ASM-OBJS-$(CONFIG_GRADFUN_FILTER)         += x86/vf_gradfun.o
 X86ASM-OBJS-$(CONFIG_HFLIP_FILTER)           += x86/vf_hflip.o
diff --git a/libavfilter/x86/vf_framerate.asm b/libavfilter/x86/vf_framerate.asm
new file mode 100644
index 0000000000..7567763328
--- /dev/null
+++ b/libavfilter/x86/vf_framerate.asm
@@ -0,0 +1,141 @@ 
+;*****************************************************************************
+;* x86-optimized functions for framerate filter
+;*
+;* Copyright (C) 2018 Marton Balint
+;*
+;* Based on vf_blend.asm, Copyright (C) 2015 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+
+%macro XSPLATD 1
+%if cpuflag(avx2)
+    vpbroadcastd m%1, xm%1
+%else
+    SPLATD   m%1
+%endif
+%endmacro
+
+
+%macro BLEND_INIT 0-1
+%if ARCH_X86_64
+cglobal blend_frames%1, 6, 9, 8, src1, src1_linesize, src2, src2_linesize, dst, dst_linesize, width, end, x
+    mov      widthd, dword widthm
+%else
+cglobal blend_frames%1, 5, 7, 8, src1, src1_linesize, src2, src2_linesize, dst, end, x
+%define dst_linesizeq r5mp
+%define widthq r6mp
+%endif
+    mov      endd, dword r7m
+    add     src1q, widthq
+    add     src2q, widthq
+    add      dstq, widthq
+    neg    widthq
+%endmacro
+
+
+%macro BLEND_LOOP 3
+    pxor       m2, m2
+.nextrow:
+    mov        xq, widthq
+
+    .loop:
+        movu            m0, [src1q + xq]
+        movu            m1, [src2q + xq]
+        punpckl%1%2     m5, m0, m2         ; 0e0f0g0h
+        punpckh%1%2     m0, m2             ; 0a0b0c0d
+        punpckl%1%2     m6, m1, m2         ; 0E0F0G0H
+        punpckh%1%2     m1, m2             ; 0A0B0C0D
+        pmull%2         m0, m3
+        pmull%2         m5, m3
+        pmull%2         m1, m4
+        pmull%2         m6, m4
+        padd%2          m0, m7
+        padd%2          m5, m7
+        padd%2          m0, m1
+        padd%2          m5, m6
+        psrl%2          m0, %3              ; 0A0B0C0D
+        psrl%2          m5, %3              ; 0E0F0G0H
+
+        packus%2%1      m5, m0              ; ABCDEFGH
+        movu   [dstq + xq], m5
+        add             xq, mmsize
+    jl .loop
+    add         src1q, src1_linesizeq
+    add         src2q, src2_linesizeq
+    add          dstq, dst_linesizeq
+    sub          endd, 1
+    jg .nextrow
+REP_RET
+%endmacro
+
+
+%macro BLEND_FRAMES 0
+    BLEND_INIT
+
+    pinsrw    xm3, r8m, 0                   ; factor1
+    pinsrw    xm4, r9m, 0                   ; factor2
+    pinsrw    xm7, r10m, 0                  ; half
+    SPLATW     m3, xm3
+    SPLATW     m4, xm4
+    SPLATW     m7, xm7
+
+    BLEND_LOOP  b, w, 8
+%endmacro
+
+
+%macro BLEND_FRAMES16 0
+    BLEND_INIT 16
+
+    pxor       m3, m3
+    pxor       m4, m4
+    pxor       m7, m7
+    pinsrw    xm3, r8m, 0                   ; factor1
+    pinsrw    xm4, r9m, 0                   ; factor2
+    pinsrw    xm7, r10m, 0                  ; half
+
+    XSPLATD       3
+    XSPLATD       4
+    XSPLATD       7
+
+    neg word r11m                           ; shift = -shift
+    add word r11m, 16                       ; shift += 16
+    pxor       m2, m2
+    pinsrw    xm2, r11m, 0                  ; 16 - shift
+    pslld      m3, xm2
+    pslld      m4, xm2
+    pslld      m7, xm2
+
+    BLEND_LOOP  w, d, 16
+%endmacro
+
+INIT_XMM sse2
+BLEND_FRAMES
+
+INIT_YMM avx2
+BLEND_FRAMES
+
+INIT_XMM sse4
+BLEND_FRAMES16
+
+INIT_YMM avx2
+BLEND_FRAMES16