diff mbox

[FFmpeg-devel] avfilter/vf_gblur: add x86 SIMD optimizations

Message ID 20190530044309.32607-1-ruiling.song@intel.com
State Superseded
Headers show

Commit Message

Ruiling Song May 30, 2019, 4:43 a.m. UTC
For details of the implementation, please refer to the comments
in the assembly code. The patch improves horizontal-pass performance
by about 100% (i.e. it halves the running time) when using a single thread.

Tested overall performance using the following commands (AVX2 enabled):
./ffmpeg -i 1080p.mp4 -vf gblur -f null /dev/null
./ffmpeg -i 1080p.mp4 -vf gblur=threads=1 -f null /dev/null
With a single thread, the frame rate improves from 43 to 60 fps, about 40%.
With multiple threads, the frame rate improves from 110 to 130 fps, about 20%.

Signed-off-by: Ruiling Song <ruiling.song@intel.com>
---
 libavfilter/gblur.h             |  54 ++++++++++
 libavfilter/vf_gblur.c          |  66 +++++-------
 libavfilter/x86/Makefile        |   2 +
 libavfilter/x86/vf_gblur.asm    | 182 ++++++++++++++++++++++++++++++++
 libavfilter/x86/vf_gblur_init.c |  36 +++++++
 5 files changed, 302 insertions(+), 38 deletions(-)
 create mode 100644 libavfilter/gblur.h
 create mode 100644 libavfilter/x86/vf_gblur.asm
 create mode 100644 libavfilter/x86/vf_gblur_init.c

Comments

Paul B Mahol May 30, 2019, 7:23 a.m. UTC | #1
On 5/30/19, Ruiling Song <ruiling.song@intel.com> wrote:
> For details of the implementation, please refer to the comment
> inlined in the assembly code. It improves the horizontal pass
> performance about 100% under single thread.
>
> Tested overall performance using the command(avx2 enabled):
> ./ffmpeg -i 1080p.mp4 -vf gblur -f null /dev/null
> ./ffmpeg -i 1080p.mp4 -vf gblur=threads=1 -f null /dev/null
> For single thread, the fps improves from 43 to 60, about 40%.
> For multi-thread, the fps improves from 110 to 130, about 20%.
>
> Signed-off-by: Ruiling Song <ruiling.song@intel.com>
> ---
>  libavfilter/gblur.h             |  54 ++++++++++
>  libavfilter/vf_gblur.c          |  66 +++++-------
>  libavfilter/x86/Makefile        |   2 +
>  libavfilter/x86/vf_gblur.asm    | 182 ++++++++++++++++++++++++++++++++
>  libavfilter/x86/vf_gblur_init.c |  36 +++++++
>  5 files changed, 302 insertions(+), 38 deletions(-)
>  create mode 100644 libavfilter/gblur.h
>  create mode 100644 libavfilter/x86/vf_gblur.asm
>  create mode 100644 libavfilter/x86/vf_gblur_init.c
>
> diff --git a/libavfilter/gblur.h b/libavfilter/gblur.h
> new file mode 100644
> index 0000000000..97217044d0
> --- /dev/null
> +++ b/libavfilter/gblur.h
> @@ -0,0 +1,54 @@
> +/*
> + * Copyright (c) 2011 Pascal Getreuer
> + * Copyright (c) 2016 Paul B Mahol
> + *
> + * Redistribution and use in source and binary forms, with or without
> modification,
> + * are permitted provided that the following conditions are met:
> + *
> + *  * Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + *  * Redistributions in binary form must reproduce the above
> + *    copyright notice, this list of conditions and the following
> + *    disclaimer in the documentation and/or other materials provided
> + *    with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + * HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
> + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
> + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
> + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
> + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
> + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef AVFILTER_GBLUR_H
> +#define AVFILTER_GBLUR_H
> +#include "avfilter.h"
> +
> +typedef struct GBlurContext {
> +    const AVClass *class;
> +
> +    float sigma;
> +    float sigmaV;
> +    int steps;
> +    int planes;
> +
> +    int depth;
> +    int planewidth[4];
> +    int planeheight[4];
> +    float *buffer;
> +    float boundaryscale;
> +    float boundaryscaleV;
> +    float postscale;
> +    float postscaleV;
> +    float nu;
> +    float nuV;
> +    int nb_planes;
> +    void (*horiz_slice)(float *buffer, int width, int height, int steps,
> float nu, float bscale);
> +} GBlurContext;
> +void ff_gblur_init_x86(GBlurContext *s);
> +#endif
> diff --git a/libavfilter/vf_gblur.c b/libavfilter/vf_gblur.c
> index b91a8c074a..4e876bca05 100644
> --- a/libavfilter/vf_gblur.c
> +++ b/libavfilter/vf_gblur.c
> @@ -30,29 +30,11 @@
>  #include "libavutil/pixdesc.h"
>  #include "avfilter.h"
>  #include "formats.h"
> +#include "gblur.h"
>  #include "internal.h"
>  #include "video.h"
> +#include <immintrin.h>

Is this header really needed?

>
> -typedef struct GBlurContext {
> -    const AVClass *class;
> -
> -    float sigma;
> -    float sigmaV;
> -    int steps;
> -    int planes;
> -
> -    int depth;
> -    int planewidth[4];
> -    int planeheight[4];
> -    float *buffer;
> -    float boundaryscale;
> -    float boundaryscaleV;
> -    float postscale;
> -    float postscaleV;
> -    float nu;
> -    float nuV;
> -    int nb_planes;
> -} GBlurContext;
>
>  #define OFFSET(x) offsetof(GBlurContext, x)
>  #define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
> @@ -72,39 +54,44 @@ typedef struct ThreadData {
>      int width;
>  } ThreadData;
>
> -static int filter_horizontally(AVFilterContext *ctx, void *arg, int jobnr,
> int nb_jobs)
> +static void horiz_slice_c(float *buffer, int width, int height, int steps,
> +                          float nu, float bscale)
>  {
> -    GBlurContext *s = ctx->priv;
> -    ThreadData *td = arg;
> -    const int height = td->height;
> -    const int width = td->width;
> -    const int slice_start = (height *  jobnr   ) / nb_jobs;
> -    const int slice_end   = (height * (jobnr+1)) / nb_jobs;
> -    const float boundaryscale = s->boundaryscale;
> -    const int steps = s->steps;
> -    const float nu = s->nu;
> -    float *buffer = s->buffer;
> -    int y, x, step;
> +    int step, x, y;
>      float *ptr;
> -
> -    /* Filter horizontally along each row */
> -    for (y = slice_start; y < slice_end; y++) {
> +    for (y = 0; y < height; y++) {
>          for (step = 0; step < steps; step++) {
>              ptr = buffer + width * y;
> -            ptr[0] *= boundaryscale;
> +            ptr[0] *= bscale;
>
>              /* Filter rightwards */
>              for (x = 1; x < width; x++)
>                  ptr[x] += nu * ptr[x - 1];
> -
> -            ptr[x = width - 1] *= boundaryscale;
> +            ptr[x = width - 1] *= bscale;
>
>              /* Filter leftwards */
>              for (; x > 0; x--)
>                  ptr[x - 1] += nu * ptr[x];
>          }
>      }
> +}
> +
> +static int filter_horizontally(AVFilterContext *ctx, void *arg, int jobnr,
> int nb_jobs)
> +{
> +    GBlurContext *s = ctx->priv;
> +    ThreadData *td = arg;
> +    const int height = td->height;
> +    const int width = td->width;
> +    const int slice_start = (height *  jobnr   ) / nb_jobs;
> +    const int slice_end   = (height * (jobnr+1)) / nb_jobs;
> +    const float boundaryscale = s->boundaryscale;
> +    const int steps = s->steps;
> +    const float nu = s->nu;
> +    float *buffer = s->buffer;
>
> +    s->horiz_slice(buffer + width * slice_start, width, slice_end -
> slice_start,
> +                   steps, nu, boundaryscale);
> +    emms_c();
>      return 0;
>  }
>
> @@ -251,6 +238,9 @@ static int config_input(AVFilterLink *inlink)
>      if (s->sigmaV < 0) {
>          s->sigmaV = s->sigma;
>      }
> +    s->horiz_slice = horiz_slice_c;
> +    if (ARCH_X86_64)
> +        ff_gblur_init_x86(s);
>
>      return 0;
>  }
> diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
> index 17499f14da..6b0361bed2 100644
> --- a/libavfilter/x86/Makefile
> +++ b/libavfilter/x86/Makefile
> @@ -7,6 +7,7 @@ OBJS-$(CONFIG_BWDIF_FILTER)                  +=
> x86/vf_bwdif_init.o
>  OBJS-$(CONFIG_COLORSPACE_FILTER)             += x86/colorspacedsp_init.o
>  OBJS-$(CONFIG_EQ_FILTER)                     += x86/vf_eq.o
>  OBJS-$(CONFIG_FSPP_FILTER)                   += x86/vf_fspp_init.o
> +OBJS-$(CONFIG_GBLUR_FILTER)                  += x86/vf_gblur_init.o
>  OBJS-$(CONFIG_GRADFUN_FILTER)                += x86/vf_gradfun_init.o
>  OBJS-$(CONFIG_FRAMERATE_FILTER)              += x86/vf_framerate_init.o
>  OBJS-$(CONFIG_HFLIP_FILTER)                  += x86/vf_hflip_init.o
> @@ -41,6 +42,7 @@ X86ASM-OBJS-$(CONFIG_BWDIF_FILTER)           +=
> x86/vf_bwdif.o
>  X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER)      += x86/colorspacedsp.o
>  X86ASM-OBJS-$(CONFIG_FRAMERATE_FILTER)       += x86/vf_framerate.o
>  X86ASM-OBJS-$(CONFIG_FSPP_FILTER)            += x86/vf_fspp.o
> +X86ASM-OBJS-$(CONFIG_GBLUR_FILTER)           += x86/vf_gblur.o
>  X86ASM-OBJS-$(CONFIG_GRADFUN_FILTER)         += x86/vf_gradfun.o
>  X86ASM-OBJS-$(CONFIG_HFLIP_FILTER)           += x86/vf_hflip.o
>  X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER)          += x86/vf_hqdn3d.o
> diff --git a/libavfilter/x86/vf_gblur.asm b/libavfilter/x86/vf_gblur.asm
> new file mode 100644
> index 0000000000..79e56a32a7
> --- /dev/null
> +++ b/libavfilter/x86/vf_gblur.asm
> @@ -0,0 +1,182 @@
> +;*****************************************************************************
> +;* x86-optimized functions for gblur filter
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION .text
> +
> +; void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps,
> +;                          float nu, float bscale)
> +
> +%macro HORIZ_SLICE 0
> +%if UNIX64
> +cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, x, y, step,
> stride, remain
> +%else
> +cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, nu, bscale, x, y,
> step, stride, remain
> +%endif
> +%if WIN64
> +    movss m0, num
> +    movss m1, bscalem
> +    DEFINE_ARGS ptr, width, height, steps, x, y, step, stride, remain
> +%endif
> +    mulss m2, m0, m0 ; nu ^ 2
> +    mulss m3, m2, m0 ; nu ^ 3
> +    mulss m4, m3, m0 ; nu ^ 4
> +    xor   xq, xq
> +    xor   yq, yq
> +    xor   stepq, stepq
> +    mov   strideq, widthq
> +    ; stride = width * 4
> +    shl   strideq, 2
> +    ; w = w - ((w - 1) & 3)
> +    mov   remainq, widthq
> +    sub   remainq, 1
> +    and   remainq, 3
> +    sub   widthq, remainq
> +
> +    shufps m0, m0, 0
> +    shufps m2, m2, 0
> +    shufps m3, m3, 0
> +    shufps m4, m4, 0
> +
> +.loop_y:
> +    .loop_step:
> +        ; p0 *= bscale
> +        mulss m5, m1, [ptrq + xq * 4]
> +        movss [ptrq + xq * 4], m5
> +        inc xq
> +        ; filter rightwards
> +        ; Here we are vectorizing the c version by 4
> +        ;    for (x = 1; x < width; x++)
> +        ;       ptr[x] += nu * ptr[x - 1];
> +        ;   let p0 stands for ptr[x-1], the data from last loop
> +        ;   and [p1,p2,p3,p4] be the vector data for this loop.
> +        ; Unrolling the loop, we get:
> +        ;   p1' = p1 + p0*nu
> +        ;   p2' = p2 + p1*nu + p0*nu^2
> +        ;   p3' = p3 + p2*nu + p1*nu^2 + p0*nu^3
> +        ;   p4' = p4 + p3*nu + p2*nu^2 + p1*nu^3 + p0*nu^4
> +        ; so we can do it in simd:
> +        ; [p1',p2',p3',p4'] = [p1,p2,p3,p4] + [p0,p1,p2,p3]*nu +
> +        ;                     [0,p0,p1,p2]*nu^2 + [0,0,p0,p1]*nu^3 +
> +        ;                     [0,0,0,p0]*nu^4
> +
> +        .loop_x:
> +            movu m6, [ptrq + xq * 4]         ; s  = [p1,p2,p3,p4]
> +            pslldq m7, m6, 4                 ;      [0, p1,p2,p3]
> +            movss  m7, m5                    ;      [p0,p1,p2,p3]
> +            FMULADD_PS  m6, m7, m0, m6, m8   ; s += [p0,p1,p2,p3] * nu
> +            pslldq m7, 4                     ;      [0,p0,p1,p2]
> +            FMULADD_PS  m6, m7, m2, m6, m8   ; s += [0,p0,p1,p2]  * nu^2
> +            pslldq m7, 4
> +            FMULADD_PS  m6, m7, m3, m6, m8   ; s += [0,0,p0,p1]   * nu^3
> +            pslldq m7, 4
> +            FMULADD_PS  m6, m7, m4, m6, m8   ; s += [0,0,0,p0]    * nu^4
> +            movu [ptrq + xq * 4], m6
> +            shufps m5, m6, m6, q3333
> +            add xq, 4
> +            cmp xq, widthq
> +            jl .loop_x
> +
> +        add widthq, remainq
> +        cmp xq, widthq
> +        je .end_scalar
> +
> +        .loop_scalar:
> +            ; ptr[x] += nu * ptr[x-1]
> +            movss m5, [ptrq + 4*xq - 4]
> +            mulss m5, m0
> +            addss m5, [ptrq + 4*xq]
> +            movss [ptrq + 4*xq], m5
> +            inc xq
> +            cmp xq, widthq
> +            jl .loop_scalar
> +        .end_scalar:
> +            ; ptr[width - 1] *= bscale
> +            dec xq
> +            mulss m5, m1, [ptrq + 4*xq]
> +            movss [ptrq + 4*xq], m5
> +            shufps m5, m5, 0
> +
> +        ; filter leftwards
> +        ;    for (; x > 0; x--)
> +        ;        ptr[x - 1] += nu * ptr[x];
> +        ; The idea here is basically the same as filter rightwards.
> +        ; But we need to take care as the data layout is different.
> +        ; Let p0 stands for the ptr[x], which is the data from last loop.
> +        ; The way we do it in simd as below:
> +        ; [p-4', p-3', p-2', p-1'] = [p-4, p-3, p-2, p-1]
> +        ;                          + [p-3, p-2, p-1, p0] * nu
> +        ;                          + [p-2, p-1, p0,  0]  * nu^2
> +        ;                          + [p-1, p0,  0,   0]  * nu^3
> +        ;                          + [p0,  0,   0,   0]  * nu^4
> +        .loop_x_back:
> +            sub xq, 4
> +            movu m6, [ptrq + xq * 4]      ; s = [p-4, p-3, p-2, p-1]
> +            psrldq m7, m6, 4              ;     [p-3, p-2, p-1, 0  ]
> +            blendps m7, m5, 0x8           ;     [p-3, p-2, p-1, p0 ]
> +            FMULADD_PS m6, m7, m0, m6, m8 ; s+= [p-3, p-2, p-1, p0 ] * nu
> +            psrldq m7, 4                  ;
> +            FMULADD_PS m6, m7, m2, m6, m8 ; s+= [p-2, p-1, p0,  0] * nu^2
> +            psrldq m7, 4
> +            FMULADD_PS m6, m7, m3, m6, m8 ; s+= [p-1, p0,   0,  0] * nu^3
> +            psrldq m7, 4
> +            FMULADD_PS m6, m7, m4, m6, m8 ; s+= [p0,  0,    0,  0] * nu^4
> +            movu [ptrq + xq * 4], m6
> +            shufps m5, m6, m6, 0          ; m5 = [p-4', p-4', p-4', p-4']
> +            cmp xq, remainq
> +            jg .loop_x_back
> +
> +        cmp xq, 0
> +        je .end_scalar_back
> +
> +        .loop_scalar_back:
> +            ; ptr[x-1] += nu * ptr[x]
> +            movss m5, [ptrq + 4*xq]
> +            mulss m5, m0
> +            addss m5, [ptrq + 4*xq - 4]
> +            movss [ptrq + 4*xq - 4], m5
> +            dec xq
> +            cmp xq, 0
> +            jg .loop_scalar_back
> +        .end_scalar_back:
> +
> +        ; reset aligned width for next line
> +        sub widthq, remainq
> +
> +        inc stepq
> +        cmp stepq, stepsq
> +        jl .loop_step
> +
> +    add ptrq, strideq
> +    inc yq
> +    cmp yq, heightq
> +    jl .loop_y
> +
> +    RET
> +%endmacro
> +
> +%if ARCH_X86_64
> +INIT_XMM sse4
> +HORIZ_SLICE
> +
> +INIT_XMM avx2
> +HORIZ_SLICE
> +%endif
> diff --git a/libavfilter/x86/vf_gblur_init.c
> b/libavfilter/x86/vf_gblur_init.c
> new file mode 100644
> index 0000000000..b068edc598
> --- /dev/null
> +++ b/libavfilter/x86/vf_gblur_init.c
> @@ -0,0 +1,36 @@
> +/*
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> + */
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavfilter/gblur.h"
> +
> +void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps,
> float nu, float bscale);
> +void ff_horiz_slice_avx2(float *ptr, int width, int height, int steps,
> float nu, float bscale);
> +
> +av_cold void ff_gblur_init_x86(GBlurContext *s)
> +{
> +    int cpu_flags = av_get_cpu_flags();
> +
> +    if (EXTERNAL_SSE4(cpu_flags))
> +        s->horiz_slice = ff_horiz_slice_sse4;
> +    if (EXTERNAL_AVX2(cpu_flags))
> +        s->horiz_slice = ff_horiz_slice_avx2;
> +}
> --
> 2.17.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Ruiling Song May 30, 2019, 7:29 a.m. UTC | #2
> -----Original Message-----

> From: Paul B Mahol [mailto:onemda@gmail.com]

> Sent: Thursday, May 30, 2019 3:24 PM

> To: FFmpeg development discussions and patches <ffmpeg-

> devel@ffmpeg.org>

> Cc: Song, Ruiling <ruiling.song@intel.com>

> Subject: Re: [FFmpeg-devel] [PATCH] avfilter/vf_gblur: add x86 SIMD

> optimizations

> 

> On 5/30/19, Ruiling Song <ruiling.song@intel.com> wrote:

> > For details of the implementation, please refer to the comment

> > inlined in the assembly code. It improves the horizontal pass

> > performance about 100% under single thread.

> >

> > Tested overall performance using the command(avx2 enabled):

> > ./ffmpeg -i 1080p.mp4 -vf gblur -f null /dev/null

> > ./ffmpeg -i 1080p.mp4 -vf gblur=threads=1 -f null /dev/null

> > For single thread, the fps improves from 43 to 60, about 40%.

> > For multi-thread, the fps improves from 110 to 130, about 20%.

> >

> > Signed-off-by: Ruiling Song <ruiling.song@intel.com>

> > ---

> >  libavfilter/gblur.h             |  54 ++++++++++

> >  libavfilter/vf_gblur.c          |  66 +++++-------

> >  libavfilter/x86/Makefile        |   2 +

> >  libavfilter/x86/vf_gblur.asm    | 182

> ++++++++++++++++++++++++++++++++

> >  libavfilter/x86/vf_gblur_init.c |  36 +++++++

> >  5 files changed, 302 insertions(+), 38 deletions(-)

> >  create mode 100644 libavfilter/gblur.h

> >  create mode 100644 libavfilter/x86/vf_gblur.asm

> >  create mode 100644 libavfilter/x86/vf_gblur_init.c


[...]
> > diff --git a/libavfilter/vf_gblur.c b/libavfilter/vf_gblur.c

> > index b91a8c074a..4e876bca05 100644

> > --- a/libavfilter/vf_gblur.c

> > +++ b/libavfilter/vf_gblur.c

> > @@ -30,29 +30,11 @@

> >  #include "libavutil/pixdesc.h"

> >  #include "avfilter.h"

> >  #include "formats.h"

> > +#include "gblur.h"

> >  #include "internal.h"

> >  #include "video.h"

> > +#include <immintrin.h>

> 

> Is this header really needed?

Oh, this is not needed; I forgot to remove it after experimenting with SSE intrinsics.
I will remove it. Thanks!

Ruiling
Carl Eugen Hoyos May 31, 2019, 10:11 p.m. UTC | #3
Am Do., 30. Mai 2019 um 05:46 Uhr schrieb Ruiling Song <ruiling.song@intel.com>:
>
> For details of the implementation, please refer to the comment
> inlined in the assembly code.

This sentence sounds unneeded to me.

> It improves the horizontal pass
> performance about 100% under single thread.

I am not a native speaker but I wonder what a "100% speed
improvement" could mean...

> Tested overall performance using the command(avx2 enabled):
> ./ffmpeg -i 1080p.mp4 -vf gblur -f null /dev/null
> ./ffmpeg -i 1080p.mp4 -vf gblur=threads=1 -f null /dev/null
> For single thread, the fps improves from 43 to 60, about 40%.
> For multi-thread, the fps improves from 110 to 130, about 20%.

Carl Eugen
Ruiling Song June 1, 2019, 1:37 p.m. UTC | #4
> -----Original Message-----

> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf

> Of Carl Eugen Hoyos

> Sent: Saturday, June 1, 2019 6:12 AM

> To: FFmpeg development discussions and patches <ffmpeg-

> devel@ffmpeg.org>

> Subject: Re: [FFmpeg-devel] [PATCH] avfilter/vf_gblur: add x86 SIMD

> optimizations

> 

> Am Do., 30. Mai 2019 um 05:46 Uhr schrieb Ruiling Song

> <ruiling.song@intel.com>:

> >

> > For details of the implementation, please refer to the comment

> > inlined in the assembly code.

> 

> This sentence sounds unneeded to me.

> 

> > It improves the horizontal pass

> > performance about 100% under single thread.

> 

> I am not a native speaker but I wonder what a "100% speed

> improvement" could mean...

It means a 50% reduction in running time.
For example, one horizontal pass per frame previously took 12 ms; it now takes 6 ms.
Any comments on the assembly code?

> 

> > Tested overall performance using the command(avx2 enabled):

> > ./ffmpeg -i 1080p.mp4 -vf gblur -f null /dev/null

> > ./ffmpeg -i 1080p.mp4 -vf gblur=threads=1 -f null /dev/null

> > For single thread, the fps improves from 43 to 60, about 40%.

> > For multi-thread, the fps improves from 110 to 130, about 20%.

> 

> Carl Eugen

> _______________________________________________

> ffmpeg-devel mailing list

> ffmpeg-devel@ffmpeg.org

> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

> 

> To unsubscribe, visit link above, or email

> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
diff mbox

Patch

diff --git a/libavfilter/gblur.h b/libavfilter/gblur.h
new file mode 100644
index 0000000000..97217044d0
--- /dev/null
+++ b/libavfilter/gblur.h
@@ -0,0 +1,54 @@ 
+/*
+ * Copyright (c) 2011 Pascal Getreuer
+ * Copyright (c) 2016 Paul B Mahol
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials provided
+ *    with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef AVFILTER_GBLUR_H
+#define AVFILTER_GBLUR_H
+#include "avfilter.h"
+
+typedef struct GBlurContext {
+    const AVClass *class;
+
+    float sigma;
+    float sigmaV;
+    int steps;
+    int planes;
+
+    int depth;
+    int planewidth[4];
+    int planeheight[4];
+    float *buffer;
+    float boundaryscale;
+    float boundaryscaleV;
+    float postscale;
+    float postscaleV;
+    float nu;
+    float nuV;
+    int nb_planes;
+    void (*horiz_slice)(float *buffer, int width, int height, int steps, float nu, float bscale);
+} GBlurContext;
+void ff_gblur_init_x86(GBlurContext *s);
+#endif
diff --git a/libavfilter/vf_gblur.c b/libavfilter/vf_gblur.c
index b91a8c074a..4e876bca05 100644
--- a/libavfilter/vf_gblur.c
+++ b/libavfilter/vf_gblur.c
@@ -30,29 +30,11 @@ 
 #include "libavutil/pixdesc.h"
 #include "avfilter.h"
 #include "formats.h"
+#include "gblur.h"
 #include "internal.h"
 #include "video.h"
+#include <immintrin.h>
 
-typedef struct GBlurContext {
-    const AVClass *class;
-
-    float sigma;
-    float sigmaV;
-    int steps;
-    int planes;
-
-    int depth;
-    int planewidth[4];
-    int planeheight[4];
-    float *buffer;
-    float boundaryscale;
-    float boundaryscaleV;
-    float postscale;
-    float postscaleV;
-    float nu;
-    float nuV;
-    int nb_planes;
-} GBlurContext;
 
 #define OFFSET(x) offsetof(GBlurContext, x)
 #define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
@@ -72,39 +54,44 @@  typedef struct ThreadData {
     int width;
 } ThreadData;
 
-static int filter_horizontally(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+static void horiz_slice_c(float *buffer, int width, int height, int steps,
+                          float nu, float bscale)
 {
-    GBlurContext *s = ctx->priv;
-    ThreadData *td = arg;
-    const int height = td->height;
-    const int width = td->width;
-    const int slice_start = (height *  jobnr   ) / nb_jobs;
-    const int slice_end   = (height * (jobnr+1)) / nb_jobs;
-    const float boundaryscale = s->boundaryscale;
-    const int steps = s->steps;
-    const float nu = s->nu;
-    float *buffer = s->buffer;
-    int y, x, step;
+    int step, x, y;
     float *ptr;
-
-    /* Filter horizontally along each row */
-    for (y = slice_start; y < slice_end; y++) {
+    for (y = 0; y < height; y++) {
         for (step = 0; step < steps; step++) {
             ptr = buffer + width * y;
-            ptr[0] *= boundaryscale;
+            ptr[0] *= bscale;
 
             /* Filter rightwards */
             for (x = 1; x < width; x++)
                 ptr[x] += nu * ptr[x - 1];
-
-            ptr[x = width - 1] *= boundaryscale;
+            ptr[x = width - 1] *= bscale;
 
             /* Filter leftwards */
             for (; x > 0; x--)
                 ptr[x - 1] += nu * ptr[x];
         }
     }
+}
+
+static int filter_horizontally(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+{
+    GBlurContext *s = ctx->priv;
+    ThreadData *td = arg;
+    const int height = td->height;
+    const int width = td->width;
+    const int slice_start = (height *  jobnr   ) / nb_jobs;
+    const int slice_end   = (height * (jobnr+1)) / nb_jobs;
+    const float boundaryscale = s->boundaryscale;
+    const int steps = s->steps;
+    const float nu = s->nu;
+    float *buffer = s->buffer;
 
+    s->horiz_slice(buffer + width * slice_start, width, slice_end - slice_start,
+                   steps, nu, boundaryscale);
+    emms_c();
     return 0;
 }
 
@@ -251,6 +238,9 @@  static int config_input(AVFilterLink *inlink)
     if (s->sigmaV < 0) {
         s->sigmaV = s->sigma;
     }
+    s->horiz_slice = horiz_slice_c;
+    if (ARCH_X86_64)
+        ff_gblur_init_x86(s);
 
     return 0;
 }
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 17499f14da..6b0361bed2 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -7,6 +7,7 @@  OBJS-$(CONFIG_BWDIF_FILTER)                  += x86/vf_bwdif_init.o
 OBJS-$(CONFIG_COLORSPACE_FILTER)             += x86/colorspacedsp_init.o
 OBJS-$(CONFIG_EQ_FILTER)                     += x86/vf_eq.o
 OBJS-$(CONFIG_FSPP_FILTER)                   += x86/vf_fspp_init.o
+OBJS-$(CONFIG_GBLUR_FILTER)                  += x86/vf_gblur_init.o
 OBJS-$(CONFIG_GRADFUN_FILTER)                += x86/vf_gradfun_init.o
 OBJS-$(CONFIG_FRAMERATE_FILTER)              += x86/vf_framerate_init.o
 OBJS-$(CONFIG_HFLIP_FILTER)                  += x86/vf_hflip_init.o
@@ -41,6 +42,7 @@  X86ASM-OBJS-$(CONFIG_BWDIF_FILTER)           += x86/vf_bwdif.o
 X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER)      += x86/colorspacedsp.o
 X86ASM-OBJS-$(CONFIG_FRAMERATE_FILTER)       += x86/vf_framerate.o
 X86ASM-OBJS-$(CONFIG_FSPP_FILTER)            += x86/vf_fspp.o
+X86ASM-OBJS-$(CONFIG_GBLUR_FILTER)           += x86/vf_gblur.o
 X86ASM-OBJS-$(CONFIG_GRADFUN_FILTER)         += x86/vf_gradfun.o
 X86ASM-OBJS-$(CONFIG_HFLIP_FILTER)           += x86/vf_hflip.o
 X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER)          += x86/vf_hqdn3d.o
diff --git a/libavfilter/x86/vf_gblur.asm b/libavfilter/x86/vf_gblur.asm
new file mode 100644
index 0000000000..79e56a32a7
--- /dev/null
+++ b/libavfilter/x86/vf_gblur.asm
@@ -0,0 +1,182 @@ 
+;*****************************************************************************
+;* x86-optimized functions for gblur filter
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+; void ff_horiz_slice_{sse4,avx2}(float *ptr, int width, int height, int steps,
+;                                 float nu, float bscale)
+;
+; In-place horizontal recursive filter over `height` rows of `width` floats.
+; Rows are contiguous (the byte stride is width * 4). For every row, the
+; following sequence is applied `steps` times:
+;     ptr[0] *= bscale
+;     for (x = 1; x < width; x++)      ptr[x]     += nu * ptr[x - 1]
+;     ptr[width - 1] *= bscale
+;     for (x = width - 1; x > 0; x--)  ptr[x - 1] += nu * ptr[x]
+;
+; Register usage:
+;     m0 = nu (broadcast), m1 = bscale (scalar),
+;     m2/m3/m4 = nu^2 / nu^3 / nu^4 (broadcast),
+;     m5 = carry value from the previous vector, m6-m8 = temporaries.
+;     x = column index, y = row counter, step = step counter,
+;     remain = number of trailing columns handled by the scalar loops.
+
+%macro HORIZ_SLICE 0
+%if UNIX64
+cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, x, y, step, stride, remain
+%else
+cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, nu, bscale, x, y, step, stride, remain
+%endif
+%if WIN64
+    ; nu and bscale are read from their argument slots into m0/m1, then the
+    ; argument names are redefined so both ABIs share the same register names.
+    movss m0, num
+    movss m1, bscalem
+    DEFINE_ARGS ptr, width, height, steps, x, y, step, stride, remain
+%endif
+    ; width is a 32-bit int but is used in 64-bit address/compare arithmetic
+    ; below; the upper half of the register is not guaranteed to be clean.
+    movsxdifnidn widthq, widthd
+
+    mulss m2, m0, m0 ; nu ^ 2
+    mulss m3, m2, m0 ; nu ^ 3
+    mulss m4, m3, m0 ; nu ^ 4
+    xor   xq, xq
+    xor   yd, yd
+    mov   strideq, widthq
+    ; stride = width * 4
+    shl   strideq, 2
+    ; w = w - ((w - 1) & 3)
+    ; i.e. (w - 1) becomes a multiple of 4, the part the vector loops cover
+    mov   remainq, widthq
+    sub   remainq, 1
+    and   remainq, 3
+    sub   widthq, remainq
+
+    shufps m0, m0, 0
+    shufps m2, m2, 0
+    shufps m3, m3, 0
+    shufps m4, m4, 0
+
+.loop_y:
+    ; the step counter must restart for every row, otherwise only the first
+    ; row would receive all `steps` iterations
+    xor   stepd, stepd
+
+    .loop_step:
+        ; p0 *= bscale
+        mulss m5, m1, [ptrq + xq * 4]
+        movss [ptrq + xq * 4], m5
+        inc xq
+        ; filter rightwards
+        ; Here we are vectorizing the C version by 4
+        ;    for (x = 1; x < width; x++)
+        ;       ptr[x] += nu * ptr[x - 1];
+        ;   let p0 stand for ptr[x-1], the data from the last loop
+        ;   and [p1,p2,p3,p4] be the vector data for this loop.
+        ; Unrolling the loop, we get:
+        ;   p1' = p1 + p0*nu
+        ;   p2' = p2 + p1*nu + p0*nu^2
+        ;   p3' = p3 + p2*nu + p1*nu^2 + p0*nu^3
+        ;   p4' = p4 + p3*nu + p2*nu^2 + p1*nu^3 + p0*nu^4
+        ; so we can do it in SIMD:
+        ; [p1',p2',p3',p4'] = [p1,p2,p3,p4] + [p0,p1,p2,p3]*nu +
+        ;                     [0,p0,p1,p2]*nu^2 + [0,0,p0,p1]*nu^3 +
+        ;                     [0,0,0,p0]*nu^4
+
+        ; rows with width <= 4 have no full vector after ptr[0]; skip the
+        ; vector loop so it cannot access memory past the end of the row
+        cmp xq, widthq
+        jge .end_loop_x
+
+        .loop_x:
+            movu m6, [ptrq + xq * 4]         ; s  = [p1,p2,p3,p4]
+            pslldq m7, m6, 4                 ;      [0, p1,p2,p3]
+            movss  m7, m5                    ;      [p0,p1,p2,p3]
+            FMULADD_PS  m6, m7, m0, m6, m8   ; s += [p0,p1,p2,p3] * nu
+            pslldq m7, 4                     ;      [0,p0,p1,p2]
+            FMULADD_PS  m6, m7, m2, m6, m8   ; s += [0,p0,p1,p2]  * nu^2
+            pslldq m7, 4
+            FMULADD_PS  m6, m7, m3, m6, m8   ; s += [0,0,p0,p1]   * nu^3
+            pslldq m7, 4
+            FMULADD_PS  m6, m7, m4, m6, m8   ; s += [0,0,0,p0]    * nu^4
+            movu [ptrq + xq * 4], m6
+            shufps m5, m6, m6, q3333         ; carry p4' into the next vector
+            add xq, 4
+            cmp xq, widthq
+            jl .loop_x
+        .end_loop_x:
+
+        ; restore the real width and finish the remaining columns one by one
+        add widthq, remainq
+        cmp xq, widthq
+        je .end_scalar
+
+        .loop_scalar:
+            ; ptr[x] += nu * ptr[x-1]
+            movss m5, [ptrq + 4*xq - 4]
+            mulss m5, m0
+            addss m5, [ptrq + 4*xq]
+            movss [ptrq + 4*xq], m5
+            inc xq
+            cmp xq, widthq
+            jl .loop_scalar
+        .end_scalar:
+            ; ptr[width - 1] *= bscale
+            dec xq
+            mulss m5, m1, [ptrq + 4*xq]
+            movss [ptrq + 4*xq], m5
+            shufps m5, m5, 0
+
+        ; filter leftwards
+        ;    for (; x > 0; x--)
+        ;        ptr[x - 1] += nu * ptr[x];
+        ; The idea here is basically the same as filtering rightwards,
+        ; but we need to take care as the data layout is different.
+        ; Let p0 stand for ptr[x], which is the data from the last loop.
+        ; The way we do it in SIMD is as below:
+        ; [p-4', p-3', p-2', p-1'] = [p-4, p-3, p-2, p-1]
+        ;                          + [p-3, p-2, p-1, p0] * nu
+        ;                          + [p-2, p-1, p0,  0]  * nu^2
+        ;                          + [p-1, p0,  0,   0]  * nu^3
+        ;                          + [p0,  0,   0,   0]  * nu^4
+
+        ; rows with width <= 4 have no full vector left of ptr[x]; skip the
+        ; vector loop so it cannot access memory before the start of the row
+        cmp xq, remainq
+        jle .end_loop_x_back
+
+        .loop_x_back:
+            sub xq, 4
+            movu m6, [ptrq + xq * 4]      ; s = [p-4, p-3, p-2, p-1]
+            psrldq m7, m6, 4              ;     [p-3, p-2, p-1, 0  ]
+            blendps m7, m5, 0x8           ;     [p-3, p-2, p-1, p0 ]
+            FMULADD_PS m6, m7, m0, m6, m8 ; s+= [p-3, p-2, p-1, p0 ] * nu
+            psrldq m7, 4                  ;
+            FMULADD_PS m6, m7, m2, m6, m8 ; s+= [p-2, p-1, p0,  0] * nu^2
+            psrldq m7, 4
+            FMULADD_PS m6, m7, m3, m6, m8 ; s+= [p-1, p0,   0,  0] * nu^3
+            psrldq m7, 4
+            FMULADD_PS m6, m7, m4, m6, m8 ; s+= [p0,  0,    0,  0] * nu^4
+            movu [ptrq + xq * 4], m6
+            shufps m5, m6, m6, 0          ; m5 = [p-4', p-4', p-4', p-4']
+            cmp xq, remainq
+            jg .loop_x_back
+        .end_loop_x_back:
+
+        cmp xq, 0
+        je .end_scalar_back
+
+        .loop_scalar_back:
+            ; ptr[x-1] += nu * ptr[x]
+            movss m5, [ptrq + 4*xq]
+            mulss m5, m0
+            addss m5, [ptrq + 4*xq - 4]
+            movss [ptrq + 4*xq - 4], m5
+            dec xq
+            cmp xq, 0
+            jg .loop_scalar_back
+        .end_scalar_back:
+
+        ; reset aligned width for next line
+        sub widthq, remainq
+
+        ; steps is a 32-bit int: compare the 32-bit halves only
+        inc stepd
+        cmp stepd, stepsd
+        jl .loop_step
+
+    add ptrq, strideq
+    ; height is a 32-bit int: compare the 32-bit halves only
+    inc yd
+    cmp yd, heightd
+    jl .loop_y
+
+    RET
+%endmacro
+
+; Emit one copy of the kernel per instruction set. HORIZ_SLICE requests
+; 9 general-purpose and 9 XMM registers, so it is only assembled on x86-64.
+%if ARCH_X86_64
+INIT_XMM sse4
+HORIZ_SLICE
+
+INIT_XMM avx2
+HORIZ_SLICE
+%endif
diff --git a/libavfilter/x86/vf_gblur_init.c b/libavfilter/x86/vf_gblur_init.c
new file mode 100644
index 0000000000..b068edc598
--- /dev/null
+++ b/libavfilter/x86/vf_gblur_init.c
@@ -0,0 +1,36 @@ 
+/*
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/gblur.h"
+
+/*
+ * Assembly implementations of the horizontal blur pass, generated by the
+ * HORIZ_SLICE macro in libavfilter/x86/vf_gblur.asm. They are only assembled
+ * when ARCH_X86_64 is set.
+ */
+void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, float nu, float bscale);
+void ff_horiz_slice_avx2(float *ptr, int width, int height, int steps, float nu, float bscale);
+
+/**
+ * Install x86 SIMD implementations into a gblur context.
+ *
+ * Overrides s->horiz_slice when the running CPU supports SSE4 or AVX2;
+ * otherwise the pointer is left untouched.
+ *
+ * @param s filter context whose horiz_slice pointer may be replaced
+ */
+av_cold void ff_gblur_init_x86(GBlurContext *s)
+{
+    /* The asm symbols only exist on x86-64, but this file is built for every
+     * x86 target; without the guard 32-bit builds reference undefined symbols. */
+#if ARCH_X86_64
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE4(cpu_flags))
+        s->horiz_slice = ff_horiz_slice_sse4;
+    /* Checked last so that AVX2 takes precedence when both are available. */
+    if (EXTERNAL_AVX2(cpu_flags))
+        s->horiz_slice = ff_horiz_slice_avx2;
+#endif
+}