diff mbox series

[FFmpeg-devel,v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI

Message ID 20221104082925.25598-1-bin.wang@intel.com
State Accepted
Commit 3ab11dc5bb6eec9b645da45fe28b1b2c29e92eed
Headers show
Series [FFmpeg-devel,v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished

Commit Message

Wang, Bin Nov. 4, 2022, 8:29 a.m. UTC
From: bwang30 <bin.wang@intel.com>

This commit enables assembly code using Intel AVX512 VNNI and adds a unit test for the sobel filter.

sobel_c: 4537
sobel_avx512icl: 2136

Signed-off-by: bwang30 <bin.wang@intel.com>
---
 libavfilter/convolution.h             |  74 +++++++++++++
 libavfilter/vf_convolution.c          |  91 +++-------------
 libavfilter/x86/vf_convolution.asm    | 147 ++++++++++++++++++++++++++
 libavfilter/x86/vf_convolution_init.c |  18 ++++
 tests/checkasm/Makefile               |   1 +
 tests/checkasm/checkasm.c             |   3 +
 tests/checkasm/checkasm.h             |   1 +
 tests/checkasm/vf_convolution.c       | 104 ++++++++++++++++++
 tests/fate/checkasm.mak               |   1 +
 9 files changed, 362 insertions(+), 78 deletions(-)
 create mode 100644 tests/checkasm/vf_convolution.c

Comments

Xiang, Haihao Nov. 7, 2022, 5:24 a.m. UTC | #1
On Fri, 2022-11-04 at 16:29 +0800, bin.wang-at-intel.com@ffmpeg.org wrote:
> From: bwang30 <bin.wang@intel.com>
> 
> This commit enabled assembly code with intel AVX512 VNNI and added unit test
> for sobel filter
> 
> sobel_c: 4537
> sobel_avx512icl 2136
> 
> Signed-off-by: bwang30 <bin.wang@intel.com>
> ---
>  libavfilter/convolution.h             |  74 +++++++++++++
>  libavfilter/vf_convolution.c          |  91 +++-------------
>  libavfilter/x86/vf_convolution.asm    | 147 ++++++++++++++++++++++++++
>  libavfilter/x86/vf_convolution_init.c |  18 ++++
>  tests/checkasm/Makefile               |   1 +
>  tests/checkasm/checkasm.c             |   3 +
>  tests/checkasm/checkasm.h             |   1 +
>  tests/checkasm/vf_convolution.c       | 104 ++++++++++++++++++
>  tests/fate/checkasm.mak               |   1 +
>  9 files changed, 362 insertions(+), 78 deletions(-)
>  create mode 100644 tests/checkasm/vf_convolution.c
> 
> diff --git a/libavfilter/convolution.h b/libavfilter/convolution.h
> index 88aabe9a20..e44bfb5da8 100644
> --- a/libavfilter/convolution.h
> +++ b/libavfilter/convolution.h
> @@ -21,6 +21,7 @@
>  #ifndef AVFILTER_CONVOLUTION_H
>  #define AVFILTER_CONVOLUTION_H
>  #include "avfilter.h"
> +#include "libavutil/intreadwrite.h"
>  
>  enum MatrixMode {
>      MATRIX_SQUARE,
> @@ -61,4 +62,77 @@ typedef struct ConvolutionContext {
>  } ConvolutionContext;
>  
>  void ff_convolution_init_x86(ConvolutionContext *s);
> +void ff_sobel_init_x86(ConvolutionContext *s, int depth, int nb_planes);
> +
> +static void setup_3x3(int radius, const uint8_t *c[], const uint8_t *src, int
> stride,
> +                      int x, int w, int y, int h, int bpc)
> +{
> +    int i;
> +
> +    for (i = 0; i < 9; i++) {
> +        int xoff = FFABS(x + ((i % 3) - 1));
> +        int yoff = FFABS(y + (i / 3) - 1);
> +
> +        xoff = xoff >= w ? 2 * w - 1 - xoff : xoff;
> +        yoff = yoff >= h ? 2 * h - 1 - yoff : yoff;
> +
> +        c[i] = src + xoff * bpc + yoff * stride;
> +    }
> +}
> +
> +static void filter_sobel(uint8_t *dst, int width,
> +                         float scale, float delta, const int *const matrix,
> +                         const uint8_t *c[], int peak, int radius,
> +                         int dstride, int stride, int size)
> +{
> +    const uint8_t *c0 = c[0], *c1 = c[1], *c2 = c[2];
> +    const uint8_t *c3 = c[3], *c5 = c[5];
> +    const uint8_t *c6 = c[6], *c7 = c[7], *c8 = c[8];
> +    int x;
> +
> +    for (x = 0; x < width; x++) {
> +        float suma = c0[x] * -1 + c1[x] * -2 + c2[x] * -1 +
> +                     c6[x] *  1 + c7[x] *  2 + c8[x] *  1;
> +        float sumb = c0[x] * -1 + c2[x] *  1 + c3[x] * -2 +
> +                     c5[x] *  2 + c6[x] * -1 + c8[x] *  1;
> +
> +        dst[x] = av_clip_uint8(sqrtf(suma*suma + sumb*sumb) * scale + delta);
> +    }
> +}
> +
> +static void filter16_sobel(uint8_t *dstp, int width,
> +                           float scale, float delta, const int *const matrix,
> +                           const uint8_t *c[], int peak, int radius,
> +                           int dstride, int stride, int size)
> +{
> +    uint16_t *dst = (uint16_t *)dstp;
> +    int x;
> +
> +    for (x = 0; x < width; x++) {
> +        float suma = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[1][2 * x]) *
> -2 + AV_RN16A(&c[2][2 * x]) * -1 +
> +                     AV_RN16A(&c[6][2 * x]) *  1 + AV_RN16A(&c[7][2 * x])
> *  2 + AV_RN16A(&c[8][2 * x]) *  1;
> +        float sumb = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[2][2 * x])
> *  1 + AV_RN16A(&c[3][2 * x]) * -2 +
> +                     AV_RN16A(&c[5][2 * x]) *  2 + AV_RN16A(&c[6][2 * x]) *
> -1 + AV_RN16A(&c[8][2 * x]) *  1;
> +
> +        dst[x] = av_clip(sqrtf(suma*suma + sumb*sumb) * scale + delta, 0,
> peak);
> +    }
> +}
> +
> +static av_unused void ff_sobel_init(ConvolutionContext *s, int depth, int
> nb_planes)
> +{
> +    for (int i = 0; i < 4; i++) {
> +        s->filter[i] = filter_sobel;
> +        s->copy[i] = !((1 << i) & s->planes);
> +        s->size[i] = 3;
> +        s->setup[i] = setup_3x3;
> +        s->rdiv[i] = s->scale;
> +        s->bias[i] = s->delta;
> +    }
> +    if (s->depth > 8)
> +        for (int i = 0; i < 4; i++)
> +            s->filter[i] = filter16_sobel;
> +#if ARCH_X86_64
> +    ff_sobel_init_x86(s, depth, nb_planes);
> +#endif
> +}
>  #endif
> diff --git a/libavfilter/vf_convolution.c b/libavfilter/vf_convolution.c
> index 9a9c099e6d..7762fa2a05 100644
> --- a/libavfilter/vf_convolution.c
> +++ b/libavfilter/vf_convolution.c
> @@ -139,24 +139,6 @@ static void filter16_roberts(uint8_t *dstp, int width,
>      }
>  }
>  
> -static void filter16_sobel(uint8_t *dstp, int width,
> -                           float scale, float delta, const int *const matrix,
> -                           const uint8_t *c[], int peak, int radius,
> -                           int dstride, int stride, int size)
> -{
> -    uint16_t *dst = (uint16_t *)dstp;
> -    int x;
> -
> -    for (x = 0; x < width; x++) {
> -        float suma = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[1][2 * x]) *
> -2 + AV_RN16A(&c[2][2 * x]) * -1 +
> -                     AV_RN16A(&c[6][2 * x]) *  1 + AV_RN16A(&c[7][2 * x])
> *  2 + AV_RN16A(&c[8][2 * x]) *  1;
> -        float sumb = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[2][2 * x])
> *  1 + AV_RN16A(&c[3][2 * x]) * -2 +
> -                     AV_RN16A(&c[5][2 * x]) *  2 + AV_RN16A(&c[6][2 * x]) *
> -1 + AV_RN16A(&c[8][2 * x]) *  1;
> -
> -        dst[x] = av_clip(sqrtf(suma*suma + sumb*sumb) * scale + delta, 0,
> peak);
> -    }
> -}
> -
>  static void filter16_scharr(uint8_t *dstp, int width,
>                              float scale, float delta, const int *const
> matrix,
>                              const uint8_t *c[], int peak, int radius,
> @@ -261,26 +243,6 @@ static void filter_roberts(uint8_t *dst, int width,
>      }
>  }
>  
> -static void filter_sobel(uint8_t *dst, int width,
> -                         float scale, float delta, const int *const matrix,
> -                         const uint8_t *c[], int peak, int radius,
> -                         int dstride, int stride, int size)
> -{
> -    const uint8_t *c0 = c[0], *c1 = c[1], *c2 = c[2];
> -    const uint8_t *c3 = c[3], *c5 = c[5];
> -    const uint8_t *c6 = c[6], *c7 = c[7], *c8 = c[8];
> -    int x;
> -
> -    for (x = 0; x < width; x++) {
> -        float suma = c0[x] * -1 + c1[x] * -2 + c2[x] * -1 +
> -                     c6[x] *  1 + c7[x] *  2 + c8[x] *  1;
> -        float sumb = c0[x] * -1 + c2[x] *  1 + c3[x] * -2 +
> -                     c5[x] *  2 + c6[x] * -1 + c8[x] *  1;
> -
> -        dst[x] = av_clip_uint8(sqrtf(suma*suma + sumb*sumb) * scale + delta);
> -    }
> -}
> -
>  static void filter_scharr(uint8_t *dst, int width,
>                            float scale, float delta, const int *const matrix,
>                            const uint8_t *c[], int peak, int radius,
> @@ -552,22 +514,6 @@ static void filter_column(uint8_t *dst, int height,
>      }
>  }
>  
> -static void setup_3x3(int radius, const uint8_t *c[], const uint8_t *src, int
> stride,
> -                      int x, int w, int y, int h, int bpc)
> -{
> -    int i;
> -
> -    for (i = 0; i < 9; i++) {
> -        int xoff = FFABS(x + ((i % 3) - 1));
> -        int yoff = FFABS(y + (i / 3) - 1);
> -
> -        xoff = xoff >= w ? 2 * w - 1 - xoff : xoff;
> -        yoff = yoff >= h ? 2 * h - 1 - yoff : yoff;
> -
> -        c[i] = src + xoff * bpc + yoff * stride;
> -    }
> -}
> -
>  static void setup_5x5(int radius, const uint8_t *c[], const uint8_t *src, int
> stride,
>                        int x, int w, int y, int h, int bpc)
>  {
> @@ -708,6 +654,18 @@ static int param_init(AVFilterContext *ctx)
>      const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
>      int p, i;
>  
> +    s->depth = desc->comp[0].depth;
> +    s->max = (1 << s->depth) - 1;
> +
> +    s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc-
> >log2_chroma_w);
> +    s->planewidth[0] = s->planewidth[3] = inlink->w;
> +    s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, desc-
> >log2_chroma_h);
> +    s->planeheight[0] = s->planeheight[3] = inlink->h;
> +
> +    s->nb_planes = av_pix_fmt_count_planes(inlink->format);
> +    s->nb_threads = ff_filter_get_nb_threads(ctx);
> +    s->bpc = (s->depth + 7) / 8;
> +
>      if (!strcmp(ctx->filter->name, "convolution")) {
>          for (i = 0; i < 4; i++) {
>              int *matrix = (int *)s->matrix[i];
> @@ -804,14 +762,7 @@ static int param_init(AVFilterContext *ctx)
>              s->bias[i] = s->delta;
>          }
>      } else if (!strcmp(ctx->filter->name, "sobel")) {
> -        for (i = 0; i < 4; i++) {
> -            s->filter[i] = filter_sobel;
> -            s->copy[i] = !((1 << i) & s->planes);
> -            s->size[i] = 3;
> -            s->setup[i] = setup_3x3;
> -            s->rdiv[i] = s->scale;
> -            s->bias[i] = s->delta;
> -        }
> +        ff_sobel_init(s, s->depth, s->nb_planes);
>      } else if (!strcmp(ctx->filter->name, "kirsch")) {
>          for (i = 0; i < 4; i++) {
>              s->filter[i] = filter_kirsch;
> @@ -832,18 +783,6 @@ static int param_init(AVFilterContext *ctx)
>          }
>      }
>  
> -    s->depth = desc->comp[0].depth;
> -    s->max = (1 << s->depth) - 1;
> -
> -    s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc-
> >log2_chroma_w);
> -    s->planewidth[0] = s->planewidth[3] = inlink->w;
> -    s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, desc-
> >log2_chroma_h);
> -    s->planeheight[0] = s->planeheight[3] = inlink->h;
> -
> -    s->nb_planes = av_pix_fmt_count_planes(inlink->format);
> -    s->nb_threads = ff_filter_get_nb_threads(ctx);
> -    s->bpc = (s->depth + 7) / 8;
> -
>      if (!strcmp(ctx->filter->name, "convolution")) {
>          if (s->depth > 8) {
>              for (p = 0; p < s->nb_planes; p++) {
> @@ -870,10 +809,6 @@ static int param_init(AVFilterContext *ctx)
>          if (s->depth > 8)
>              for (p = 0; p < s->nb_planes; p++)
>                  s->filter[p] = filter16_roberts;
> -    } else if (!strcmp(ctx->filter->name, "sobel")) {
> -        if (s->depth > 8)
> -            for (p = 0; p < s->nb_planes; p++)
> -                s->filter[p] = filter16_sobel;
>      } else if (!strcmp(ctx->filter->name, "kirsch")) {
>          if (s->depth > 8)
>              for (p = 0; p < s->nb_planes; p++)
> diff --git a/libavfilter/x86/vf_convolution.asm
> b/libavfilter/x86/vf_convolution.asm
> index 754d4d1064..c912d56752 100644
> --- a/libavfilter/x86/vf_convolution.asm
> +++ b/libavfilter/x86/vf_convolution.asm
> @@ -22,6 +22,18 @@
>  
>  SECTION_RODATA
>  half:   dd 0.5
> +data_p1: dd  1
> +data_n1: dd -1
> +data_p2: dd  2
> +data_n2: dd -2
> +
> +ALIGN 64
> +sobel_perm: db  0, 16, 32, 48,  1, 17, 33, 49,  2, 18, 34, 50,  3, 19, 35, 51
> +            db  4, 20, 36, 52,  5, 21, 37, 53,  6, 22, 38, 54,  7, 23, 39, 55
> +            db  8, 24, 40, 56,  9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59
> +            db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63
> +sobel_mulA: db -1,  1, -2,  2
> +sobel_mulB: db  1, -1,  2, -2
>  
>  SECTION .text
>  
> @@ -154,3 +166,138 @@ cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias,
> matrix, ptr, c0, c1, c2, c
>  INIT_XMM sse4
>  FILTER_3X3
>  %endif
> +
> +%macro SOBEL_MUL 2
> +    movzx ptrd, byte [c%1q + xq]
> +    imul  ptrd, [%2]
> +    add   rd, ptrd
> +%endmacro
> +
> +%macro SOBEL_ADD 1
> +    movzx ptrd, byte [c%1q + xq]
> +    add   rd, ptrd
> +%endmacro
> +
> +; void filter_sobel_avx512(uint8_t *dst, int width,
> +;                      float scale, float delta, const int *const matrix,
> +;                      const uint8_t *c[], int peak, int radius,
> +;                      int dstride, int stride)
> +%macro FILTER_SOBEL 0
> +%if UNIX64
> +cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3, c4,
> c5, c6, c7, c8, r, x
> +%else
> +cglobal filter_sobel, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1,
> c2, c3, c4, c5, c6, c7, c8, r, x
> +%endif
> +%if WIN64
> +    SWAP xmm0, xmm2
> +    SWAP xmm1, xmm3
> +    mov  r2q, matrixmp
> +    mov  r3q, ptrmp
> +    DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8,
> r, x
> +%endif
> +    movsxdifnidn widthq, widthd
> +    VBROADCASTSS m0, xmm0
> +    VBROADCASTSS m1, xmm1
> +    pxor  m6, m6
> +    mov   c0q, [ptrq + 0*gprsize]
> +    mov   c1q, [ptrq + 1*gprsize]
> +    mov   c2q, [ptrq + 2*gprsize]
> +    mov   c3q, [ptrq + 3*gprsize]
> +    mov   c4q, [ptrq + 4*gprsize]
> +    mov   c5q, [ptrq + 5*gprsize]
> +    mov   c6q, [ptrq + 6*gprsize]
> +    mov   c7q, [ptrq + 7*gprsize]
> +    mov   c8q, [ptrq + 8*gprsize]
> +
> +    xor   xq, xq
> +    cmp   widthq, mmsize/4
> +    jl .loop2
> +
> +    mov   rq, widthq
> +    and   rq, mmsize/4-1
> +    sub   widthq, rq
> +
> +    mova  m6, [sobel_perm]
> +.loop1:
> +    movu          xm3, [c2q + xq]
> +    pmovzxbd      m5, [c0q + xq]
> +    vinserti32x4  ym3, [c6q + xq], 1
> +    pmovzxbd      m4, [c8q + xq]
> +    vinserti32x4  m2, m3, [c1q + xq], 2
> +    vinserti32x4  m3, [c5q + xq], 2
> +    vinserti32x4  m2, [c7q + xq], 3
> +    vinserti32x4  m3, [c3q + xq], 3
> +    vpermb        m2, m6, m2
> +    psubd         m4, m5
> +    vpermb        m3, m6, m3
> +    mova          m5, m4
> +    vpdpbusd      m4, m2, [sobel_mulA] {1to16}
> +    vpdpbusd      m5, m3, [sobel_mulB] {1to16}
> +
> +    cvtdq2ps  m4, m4
> +    mulps     m4, m4
> +
> +    cvtdq2ps    m5, m5
> +    VFMADD231PS m4, m5, m5
> +
> +    sqrtps    m4, m4
> +    fmaddps m4, m4, m0, m1
> +    cvttps2dq m4, m4
> +    vpmovusdb [dstq + xq], m4
> +
> +    add xq, mmsize/4
> +    cmp xq, widthq
> +    jl .loop1
> +
> +    add widthq, rq
> +    cmp xq, widthq
> +    jge .end
> +
> +.loop2:
> +    xor  rd, rd
> +    pxor m4, m4
> +
> +    ;Gx
> +    SOBEL_MUL 0, data_n1
> +    SOBEL_MUL 1, data_n2
> +    SOBEL_MUL 2, data_n1
> +    SOBEL_ADD 6
> +    SOBEL_MUL 7, data_p2
> +    SOBEL_ADD 8
> +
> +    cvtsi2ss xmm4, rd
> +    mulss    xmm4, xmm4
> +
> +    xor rd, rd
> +    ;Gy
> +    SOBEL_MUL 0, data_n1
> +    SOBEL_ADD 2
> +    SOBEL_MUL 3, data_n2
> +    SOBEL_MUL 5, data_p2
> +    SOBEL_MUL 6, data_n1
> +    SOBEL_ADD 8
> +
> +    cvtsi2ss  xmm5, rd
> +    fmaddss xmm4, xmm5, xmm5, xmm4
> +
> +    sqrtps    xmm4, xmm4
> +    fmaddss   xmm4, xmm4, xmm0, xmm1     ;sum = sum * rdiv + bias
> +    cvttps2dq xmm4, xmm4     ; trunc to integer
> +    packssdw  xmm4, xmm4
> +    packuswb  xmm4, xmm4
> +    movd      rd, xmm4
> +    mov       [dstq + xq], rb
> +
> +    add xq, 1
> +    cmp xq, widthq
> +    jl .loop2
> +.end:
> +    RET
> +%endmacro
> +
> +%if ARCH_X86_64
> +%if HAVE_AVX512ICL_EXTERNAL
> +INIT_ZMM avx512icl
> +FILTER_SOBEL
> +%endif
> +%endif
> diff --git a/libavfilter/x86/vf_convolution_init.c
> b/libavfilter/x86/vf_convolution_init.c
> index b78a47d02b..bff10ca1a4 100644
> --- a/libavfilter/x86/vf_convolution_init.c
> +++ b/libavfilter/x86/vf_convolution_init.c
> @@ -29,6 +29,11 @@ void ff_filter_3x3_sse4(uint8_t *dst, int width,
>                          const uint8_t *c[], int peak, int radius,
>                          int dstride, int stride, int size);
>  
> +void ff_filter_sobel_avx512icl(uint8_t *dst, int width,
> +                         float scale, float delta, const int *const matrix,
> +                         const uint8_t *c[], int peak, int radius,
> +                         int dstride, int stride, int size);
> +
>  av_cold void ff_convolution_init_x86(ConvolutionContext *s)
>  {
>  #if ARCH_X86_64
> @@ -44,3 +49,16 @@ av_cold void ff_convolution_init_x86(ConvolutionContext *s)
>      }
>  #endif
>  }
> +
> +av_cold void ff_sobel_init_x86(ConvolutionContext *s, int depth, int
> nb_planes)
> +{
> +#if ARCH_X86_64
> +    int cpu_flags = av_get_cpu_flags();
> +    for (int i = 0; i < nb_planes; i++) {
> +        if (depth == 8) {
> +            if (EXTERNAL_AVX512ICL(cpu_flags))
> +                s->filter[i] = ff_filter_sobel_avx512icl;
> +        }
> +    }
> +#endif
> +}
> diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
> index 62d6616faf..a6f06c7007 100644
> --- a/tests/checkasm/Makefile
> +++ b/tests/checkasm/Makefile
> @@ -46,6 +46,7 @@ AVFILTEROBJS-$(CONFIG_GBLUR_FILTER)      += vf_gblur.o
>  AVFILTEROBJS-$(CONFIG_HFLIP_FILTER)      += vf_hflip.o
>  AVFILTEROBJS-$(CONFIG_THRESHOLD_FILTER)  += vf_threshold.o
>  AVFILTEROBJS-$(CONFIG_NLMEANS_FILTER)    += vf_nlmeans.o
> +AVFILTEROBJS-$(CONFIG_SOBEL_FILTER)      += vf_convolution.o
>  
>  CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
>  
> diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
> index 421bd096c5..3eb4780a64 100644
> --- a/tests/checkasm/checkasm.c
> +++ b/tests/checkasm/checkasm.c
> @@ -197,6 +197,9 @@ static const struct {
>      #if CONFIG_THRESHOLD_FILTER
>          { "vf_threshold", checkasm_check_vf_threshold },
>      #endif
> +    #if CONFIG_SOBEL_FILTER
> +        { "vf_sobel", checkasm_check_vf_sobel },
> +    #endif
>  #endif
>  #if CONFIG_SWSCALE
>      { "sw_gbrp", checkasm_check_sw_gbrp },
> diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
> index ee9151410e..214918e7ea 100644
> --- a/tests/checkasm/checkasm.h
> +++ b/tests/checkasm/checkasm.h
> @@ -86,6 +86,7 @@ void checkasm_check_vf_eq(void);
>  void checkasm_check_vf_gblur(void);
>  void checkasm_check_vf_hflip(void);
>  void checkasm_check_vf_threshold(void);
> +void checkasm_check_vf_sobel(void);
>  void checkasm_check_vp8dsp(void);
>  void checkasm_check_vp9dsp(void);
>  void checkasm_check_videodsp(void);
> diff --git a/tests/checkasm/vf_convolution.c b/tests/checkasm/vf_convolution.c
> new file mode 100644
> index 0000000000..007865863e
> --- /dev/null
> +++ b/tests/checkasm/vf_convolution.c
> @@ -0,0 +1,104 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> + */
> +
> +#include <string.h>
> +#include "checkasm.h"
> +#include "libavfilter/avfilter.h"
> +#include "libavfilter/convolution.h"
> +#include "libavutil/intreadwrite.h"
> +#include "libavutil/mem_internal.h"
> +
> +#define WIDTH 512
> +#define HEIGHT 512
> +#define SRC_STRIDE 512
> +#define PIXELS (WIDTH * HEIGHT)
> +
> +#define randomize_buffers(buf, size)      \
> +    do {                                  \
> +        int j;                            \
> +        uint8_t *tmp_buf = (uint8_t *)buf;\
> +        for (j = 0; j< size; j++)         \
> +            tmp_buf[j] = rnd() & 0xFF;    \
> +    } while (0)
> +
> +static void check_sobel(const char * report_name)
> +{
> +    LOCAL_ALIGNED_32(uint8_t, src,     [PIXELS]);
> +    LOCAL_ALIGNED_32(uint8_t, dst_ref, [PIXELS]);
> +    LOCAL_ALIGNED_32(uint8_t, dst_new, [PIXELS]);
> +    const int height = WIDTH;
> +    const int width  = HEIGHT;
> +    const int stride = SRC_STRIDE;
> +    const int dstride = SRC_STRIDE;
> +    int mode = 0;
> +    const uint8_t *c[49];
> +    const int radius = 1;
> +    const int bpc = 1;
> +    const int step = mode == MATRIX_COLUMN ? 16 : 1;
> +    const int slice_start = 0;
> +    const int slice_end = height;
> +    int y;
> +    const int sizew = mode == MATRIX_COLUMN ? height : width;
> +    float scale = 2;
> +    float delta = 10;
> +
> +    ConvolutionContext s;
> +
> +    declare_func(void, uint8_t *dst, int width, float scale, float delta,
> const int *const matrix,
> +                 const uint8_t *c[], int peak, int radius, int dstride, int
> stride, int size);
> +
> +    s.scale = scale;
> +    s.delta = delta;
> +    s.depth = 8;
> +    s.nb_planes = 3;
> +    s.planes = 15;
> +    ff_sobel_init(&s, s.depth, s.nb_planes);
> +
> +    memset(dst_ref, 0, PIXELS);
> +    memset(dst_new, 0, PIXELS);
> +    randomize_buffers(src, PIXELS);
> +
> +    if (check_func(s.filter[0], "%s", report_name)) {
> +        for (y = slice_start; y < slice_end; y += step) {
> +            const int xoff = mode == MATRIX_COLUMN ? (y - slice_start) * bpc
> : radius * bpc;
> +            const int yoff = mode == MATRIX_COLUMN ? radius * dstride : 0;
> +
> +            s.setup[0](radius, c, src, stride, radius, width, y, height,
> bpc);
> +            call_ref(dst_ref + yoff + xoff, sizew - 2 * radius,
> +                     scale, delta, NULL, c, 0, radius,
> +                     dstride, stride, slice_end - step);
> +            call_new(dst_new + yoff + xoff, sizew - 2 * radius,
> +                     scale, delta, NULL, c, 0, radius,
> +                     dstride, stride, slice_end - step);
> +            if (memcmp(dst_ref + yoff + xoff, dst_new + yoff + xoff,
> slice_end - step))
> +                fail();
> +            bench_new(dst_new + yoff + xoff, sizew - 2 * radius,
> +                      scale, delta, NULL, c, 0, radius,
> +                      dstride, stride, slice_end - step);
> +            if (mode != MATRIX_COLUMN)
> +                dst_ref += dstride;
> +        }
> +    }
> +
> +}
> +
> +void checkasm_check_vf_sobel(void)
> +{
> +    check_sobel("sobel");
> +    report("convolution:sobel");
> +}
> diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
> index aa9b288e12..a4e95541f5 100644
> --- a/tests/fate/checkasm.mak
> +++ b/tests/fate/checkasm.mak
> @@ -43,6 +43,7 @@ FATE_CHECKASM = fate-checkasm-
> aacpsdsp                                  \
>                  fate-checkasm-vf_hflip                                  \
>                  fate-checkasm-vf_nlmeans                                \
>                  fate-checkasm-vf_threshold                              \
> +                fate-checkasm-vf_sobel                                  \
>                  fate-checkasm-videodsp                                  \
>                  fate-checkasm-vorbisdsp                                 \
>                  fate-checkasm-vp8dsp                                    \

LGTM, and it works well for me. I saw a significant FPS improvement when running
the command below.

$ ffmpeg -i 1920x1080.mp4 -vf "sobel" -f null -

Thanks
Haihao
Xiang, Haihao Nov. 11, 2022, 3 a.m. UTC | #2
On Mon, 2022-11-07 at 05:24 +0000, Xiang, Haihao wrote:
> On Fri, 2022-11-04 at 16:29 +0800, bin.wang-at-intel.com@ffmpeg.org wrote:
> > From: bwang30 <bin.wang@intel.com>
> > 
> > This commit enabled assembly code with intel AVX512 VNNI and added unit test
> > for sobel filter
> > 
> > sobel_c: 4537
> > sobel_avx512icl 2136
> > 
> > Signed-off-by: bwang30 <bin.wang@intel.com>
> > ---
> >  libavfilter/convolution.h             |  74 +++++++++++++
> >  libavfilter/vf_convolution.c          |  91 +++-------------
> >  libavfilter/x86/vf_convolution.asm    | 147 ++++++++++++++++++++++++++
> >  libavfilter/x86/vf_convolution_init.c |  18 ++++
> >  tests/checkasm/Makefile               |   1 +
> >  tests/checkasm/checkasm.c             |   3 +
> >  tests/checkasm/checkasm.h             |   1 +
> >  tests/checkasm/vf_convolution.c       | 104 ++++++++++++++++++
> >  tests/fate/checkasm.mak               |   1 +
> >  9 files changed, 362 insertions(+), 78 deletions(-)
> >  create mode 100644 tests/checkasm/vf_convolution.c
> > 
> > diff --git a/libavfilter/convolution.h b/libavfilter/convolution.h
> > index 88aabe9a20..e44bfb5da8 100644
> > --- a/libavfilter/convolution.h
> > +++ b/libavfilter/convolution.h
> > @@ -21,6 +21,7 @@
> >  #ifndef AVFILTER_CONVOLUTION_H
> >  #define AVFILTER_CONVOLUTION_H
> >  #include "avfilter.h"
> > +#include "libavutil/intreadwrite.h"
> >  
> >  enum MatrixMode {
> >      MATRIX_SQUARE,
> > @@ -61,4 +62,77 @@ typedef struct ConvolutionContext {
> >  } ConvolutionContext;
> >  
> >  void ff_convolution_init_x86(ConvolutionContext *s);
> > +void ff_sobel_init_x86(ConvolutionContext *s, int depth, int nb_planes);
> > +
> > +static void setup_3x3(int radius, const uint8_t *c[], const uint8_t *src,
> > int
> > stride,
> > +                      int x, int w, int y, int h, int bpc)
> > +{
> > +    int i;
> > +
> > +    for (i = 0; i < 9; i++) {
> > +        int xoff = FFABS(x + ((i % 3) - 1));
> > +        int yoff = FFABS(y + (i / 3) - 1);
> > +
> > +        xoff = xoff >= w ? 2 * w - 1 - xoff : xoff;
> > +        yoff = yoff >= h ? 2 * h - 1 - yoff : yoff;
> > +
> > +        c[i] = src + xoff * bpc + yoff * stride;
> > +    }
> > +}
> > +
> > +static void filter_sobel(uint8_t *dst, int width,
> > +                         float scale, float delta, const int *const matrix,
> > +                         const uint8_t *c[], int peak, int radius,
> > +                         int dstride, int stride, int size)
> > +{
> > +    const uint8_t *c0 = c[0], *c1 = c[1], *c2 = c[2];
> > +    const uint8_t *c3 = c[3], *c5 = c[5];
> > +    const uint8_t *c6 = c[6], *c7 = c[7], *c8 = c[8];
> > +    int x;
> > +
> > +    for (x = 0; x < width; x++) {
> > +        float suma = c0[x] * -1 + c1[x] * -2 + c2[x] * -1 +
> > +                     c6[x] *  1 + c7[x] *  2 + c8[x] *  1;
> > +        float sumb = c0[x] * -1 + c2[x] *  1 + c3[x] * -2 +
> > +                     c5[x] *  2 + c6[x] * -1 + c8[x] *  1;
> > +
> > +        dst[x] = av_clip_uint8(sqrtf(suma*suma + sumb*sumb) * scale +
> > delta);
> > +    }
> > +}
> > +
> > +static void filter16_sobel(uint8_t *dstp, int width,
> > +                           float scale, float delta, const int *const
> > matrix,
> > +                           const uint8_t *c[], int peak, int radius,
> > +                           int dstride, int stride, int size)
> > +{
> > +    uint16_t *dst = (uint16_t *)dstp;
> > +    int x;
> > +
> > +    for (x = 0; x < width; x++) {
> > +        float suma = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[1][2 * x]) *
> > -2 + AV_RN16A(&c[2][2 * x]) * -1 +
> > +                     AV_RN16A(&c[6][2 * x]) *  1 + AV_RN16A(&c[7][2 * x])
> > *  2 + AV_RN16A(&c[8][2 * x]) *  1;
> > +        float sumb = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[2][2 * x])
> > *  1 + AV_RN16A(&c[3][2 * x]) * -2 +
> > +                     AV_RN16A(&c[5][2 * x]) *  2 + AV_RN16A(&c[6][2 * x]) *
> > -1 + AV_RN16A(&c[8][2 * x]) *  1;
> > +
> > +        dst[x] = av_clip(sqrtf(suma*suma + sumb*sumb) * scale + delta, 0,
> > peak);
> > +    }
> > +}
> > +
> > +static av_unused void ff_sobel_init(ConvolutionContext *s, int depth, int
> > nb_planes)
> > +{
> > +    for (int i = 0; i < 4; i++) {
> > +        s->filter[i] = filter_sobel;
> > +        s->copy[i] = !((1 << i) & s->planes);
> > +        s->size[i] = 3;
> > +        s->setup[i] = setup_3x3;
> > +        s->rdiv[i] = s->scale;
> > +        s->bias[i] = s->delta;
> > +    }
> > +    if (s->depth > 8)
> > +        for (int i = 0; i < 4; i++)
> > +            s->filter[i] = filter16_sobel;
> > +#if ARCH_X86_64
> > +    ff_sobel_init_x86(s, depth, nb_planes);
> > +#endif
> > +}
> >  #endif
> > diff --git a/libavfilter/vf_convolution.c b/libavfilter/vf_convolution.c
> > index 9a9c099e6d..7762fa2a05 100644
> > --- a/libavfilter/vf_convolution.c
> > +++ b/libavfilter/vf_convolution.c
> > @@ -139,24 +139,6 @@ static void filter16_roberts(uint8_t *dstp, int width,
> >      }
> >  }
> >  
> > -static void filter16_sobel(uint8_t *dstp, int width,
> > -                           float scale, float delta, const int *const
> > matrix,
> > -                           const uint8_t *c[], int peak, int radius,
> > -                           int dstride, int stride, int size)
> > -{
> > -    uint16_t *dst = (uint16_t *)dstp;
> > -    int x;
> > -
> > -    for (x = 0; x < width; x++) {
> > -        float suma = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[1][2 * x]) *
> > -2 + AV_RN16A(&c[2][2 * x]) * -1 +
> > -                     AV_RN16A(&c[6][2 * x]) *  1 + AV_RN16A(&c[7][2 * x])
> > *  2 + AV_RN16A(&c[8][2 * x]) *  1;
> > -        float sumb = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[2][2 * x])
> > *  1 + AV_RN16A(&c[3][2 * x]) * -2 +
> > -                     AV_RN16A(&c[5][2 * x]) *  2 + AV_RN16A(&c[6][2 * x]) *
> > -1 + AV_RN16A(&c[8][2 * x]) *  1;
> > -
> > -        dst[x] = av_clip(sqrtf(suma*suma + sumb*sumb) * scale + delta, 0,
> > peak);
> > -    }
> > -}
> > -
> >  static void filter16_scharr(uint8_t *dstp, int width,
> >                              float scale, float delta, const int *const
> > matrix,
> >                              const uint8_t *c[], int peak, int radius,
> > @@ -261,26 +243,6 @@ static void filter_roberts(uint8_t *dst, int width,
> >      }
> >  }
> >  
> > -static void filter_sobel(uint8_t *dst, int width,
> > -                         float scale, float delta, const int *const matrix,
> > -                         const uint8_t *c[], int peak, int radius,
> > -                         int dstride, int stride, int size)
> > -{
> > -    const uint8_t *c0 = c[0], *c1 = c[1], *c2 = c[2];
> > -    const uint8_t *c3 = c[3], *c5 = c[5];
> > -    const uint8_t *c6 = c[6], *c7 = c[7], *c8 = c[8];
> > -    int x;
> > -
> > -    for (x = 0; x < width; x++) {
> > -        float suma = c0[x] * -1 + c1[x] * -2 + c2[x] * -1 +
> > -                     c6[x] *  1 + c7[x] *  2 + c8[x] *  1;
> > -        float sumb = c0[x] * -1 + c2[x] *  1 + c3[x] * -2 +
> > -                     c5[x] *  2 + c6[x] * -1 + c8[x] *  1;
> > -
> > -        dst[x] = av_clip_uint8(sqrtf(suma*suma + sumb*sumb) * scale +
> > delta);
> > -    }
> > -}
> > -
> >  static void filter_scharr(uint8_t *dst, int width,
> >                            float scale, float delta, const int *const
> > matrix,
> >                            const uint8_t *c[], int peak, int radius,
> > @@ -552,22 +514,6 @@ static void filter_column(uint8_t *dst, int height,
> >      }
> >  }
> >  
> > -static void setup_3x3(int radius, const uint8_t *c[], const uint8_t *src,
> > int
> > stride,
> > -                      int x, int w, int y, int h, int bpc)
> > -{
> > -    int i;
> > -
> > -    for (i = 0; i < 9; i++) {
> > -        int xoff = FFABS(x + ((i % 3) - 1));
> > -        int yoff = FFABS(y + (i / 3) - 1);
> > -
> > -        xoff = xoff >= w ? 2 * w - 1 - xoff : xoff;
> > -        yoff = yoff >= h ? 2 * h - 1 - yoff : yoff;
> > -
> > -        c[i] = src + xoff * bpc + yoff * stride;
> > -    }
> > -}
> > -
> >  static void setup_5x5(int radius, const uint8_t *c[], const uint8_t *src,
> > int
> > stride,
> >                        int x, int w, int y, int h, int bpc)
> >  {
> > @@ -708,6 +654,18 @@ static int param_init(AVFilterContext *ctx)
> >      const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
> >      int p, i;
> >  
> > +    s->depth = desc->comp[0].depth;
> > +    s->max = (1 << s->depth) - 1;
> > +
> > +    s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc-
> > > log2_chroma_w);
> > +    s->planewidth[0] = s->planewidth[3] = inlink->w;
> > +    s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, desc-
> > > log2_chroma_h);
> > +    s->planeheight[0] = s->planeheight[3] = inlink->h;
> > +
> > +    s->nb_planes = av_pix_fmt_count_planes(inlink->format);
> > +    s->nb_threads = ff_filter_get_nb_threads(ctx);
> > +    s->bpc = (s->depth + 7) / 8;
> > +
> >      if (!strcmp(ctx->filter->name, "convolution")) {
> >          for (i = 0; i < 4; i++) {
> >              int *matrix = (int *)s->matrix[i];
> > @@ -804,14 +762,7 @@ static int param_init(AVFilterContext *ctx)
> >              s->bias[i] = s->delta;
> >          }
> >      } else if (!strcmp(ctx->filter->name, "sobel")) {
> > -        for (i = 0; i < 4; i++) {
> > -            s->filter[i] = filter_sobel;
> > -            s->copy[i] = !((1 << i) & s->planes);
> > -            s->size[i] = 3;
> > -            s->setup[i] = setup_3x3;
> > -            s->rdiv[i] = s->scale;
> > -            s->bias[i] = s->delta;
> > -        }
> > +        ff_sobel_init(s, s->depth, s->nb_planes);
> >      } else if (!strcmp(ctx->filter->name, "kirsch")) {
> >          for (i = 0; i < 4; i++) {
> >              s->filter[i] = filter_kirsch;
> > @@ -832,18 +783,6 @@ static int param_init(AVFilterContext *ctx)
> >          }
> >      }
> >  
> > -    s->depth = desc->comp[0].depth;
> > -    s->max = (1 << s->depth) - 1;
> > -
> > -    s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc-
> > > log2_chroma_w);
> > -    s->planewidth[0] = s->planewidth[3] = inlink->w;
> > -    s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, desc-
> > > log2_chroma_h);
> > -    s->planeheight[0] = s->planeheight[3] = inlink->h;
> > -
> > -    s->nb_planes = av_pix_fmt_count_planes(inlink->format);
> > -    s->nb_threads = ff_filter_get_nb_threads(ctx);
> > -    s->bpc = (s->depth + 7) / 8;
> > -
> >      if (!strcmp(ctx->filter->name, "convolution")) {
> >          if (s->depth > 8) {
> >              for (p = 0; p < s->nb_planes; p++) {
> > @@ -870,10 +809,6 @@ static int param_init(AVFilterContext *ctx)
> >          if (s->depth > 8)
> >              for (p = 0; p < s->nb_planes; p++)
> >                  s->filter[p] = filter16_roberts;
> > -    } else if (!strcmp(ctx->filter->name, "sobel")) {
> > -        if (s->depth > 8)
> > -            for (p = 0; p < s->nb_planes; p++)
> > -                s->filter[p] = filter16_sobel;
> >      } else if (!strcmp(ctx->filter->name, "kirsch")) {
> >          if (s->depth > 8)
> >              for (p = 0; p < s->nb_planes; p++)
> > diff --git a/libavfilter/x86/vf_convolution.asm
> > b/libavfilter/x86/vf_convolution.asm
> > index 754d4d1064..c912d56752 100644
> > --- a/libavfilter/x86/vf_convolution.asm
> > +++ b/libavfilter/x86/vf_convolution.asm
> > @@ -22,6 +22,18 @@
> >  
> >  SECTION_RODATA
> >  half:   dd 0.5
> > +data_p1: dd  1
> > +data_n1: dd -1
> > +data_p2: dd  2
> > +data_n2: dd -2
> > +
> > +ALIGN 64
> > +sobel_perm: db  0, 16, 32, 48,  1, 17, 33, 49,  2, 18, 34, 50,  3, 19, 35,
> > 51
> > +            db  4, 20, 36, 52,  5, 21, 37, 53,  6, 22, 38, 54,  7, 23, 39,
> > 55
> > +            db  8, 24, 40, 56,  9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43,
> > 59
> > +            db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47,
> > 63
> > +sobel_mulA: db -1,  1, -2,  2
> > +sobel_mulB: db  1, -1,  2, -2
> >  
> >  SECTION .text
> >  
> > @@ -154,3 +166,138 @@ cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias,
> > matrix, ptr, c0, c1, c2, c
> >  INIT_XMM sse4
> >  FILTER_3X3
> >  %endif
> > +
> > +%macro SOBEL_MUL 2
> > +    movzx ptrd, byte [c%1q + xq]
> > +    imul  ptrd, [%2]
> > +    add   rd, ptrd
> > +%endmacro
> > +
> > +%macro SOBEL_ADD 1
> > +    movzx ptrd, byte [c%1q + xq]
> > +    add   rd, ptrd
> > +%endmacro
> > +
> > +; void filter_sobel_avx512(uint8_t *dst, int width,
> > +;                      float scale, float delta, const int *const matrix,
> > +;                      const uint8_t *c[], int peak, int radius,
> > +;                      int dstride, int stride)
> > +%macro FILTER_SOBEL 0
> > +%if UNIX64
> > +cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3,
> > c4,
> > c5, c6, c7, c8, r, x
> > +%else
> > +cglobal filter_sobel, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0,
> > c1,
> > c2, c3, c4, c5, c6, c7, c8, r, x
> > +%endif
> > +%if WIN64
> > +    SWAP xmm0, xmm2
> > +    SWAP xmm1, xmm3
> > +    mov  r2q, matrixmp
> > +    mov  r3q, ptrmp
> > +    DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7,
> > c8,
> > r, x
> > +%endif
> > +    movsxdifnidn widthq, widthd
> > +    VBROADCASTSS m0, xmm0
> > +    VBROADCASTSS m1, xmm1
> > +    pxor  m6, m6
> > +    mov   c0q, [ptrq + 0*gprsize]
> > +    mov   c1q, [ptrq + 1*gprsize]
> > +    mov   c2q, [ptrq + 2*gprsize]
> > +    mov   c3q, [ptrq + 3*gprsize]
> > +    mov   c4q, [ptrq + 4*gprsize]
> > +    mov   c5q, [ptrq + 5*gprsize]
> > +    mov   c6q, [ptrq + 6*gprsize]
> > +    mov   c7q, [ptrq + 7*gprsize]
> > +    mov   c8q, [ptrq + 8*gprsize]
> > +
> > +    xor   xq, xq
> > +    cmp   widthq, mmsize/4
> > +    jl .loop2
> > +
> > +    mov   rq, widthq
> > +    and   rq, mmsize/4-1
> > +    sub   widthq, rq
> > +
> > +    mova  m6, [sobel_perm]
> > +.loop1:
> > +    movu          xm3, [c2q + xq]
> > +    pmovzxbd      m5, [c0q + xq]
> > +    vinserti32x4  ym3, [c6q + xq], 1
> > +    pmovzxbd      m4, [c8q + xq]
> > +    vinserti32x4  m2, m3, [c1q + xq], 2
> > +    vinserti32x4  m3, [c5q + xq], 2
> > +    vinserti32x4  m2, [c7q + xq], 3
> > +    vinserti32x4  m3, [c3q + xq], 3
> > +    vpermb        m2, m6, m2
> > +    psubd         m4, m5
> > +    vpermb        m3, m6, m3
> > +    mova          m5, m4
> > +    vpdpbusd      m4, m2, [sobel_mulA] {1to16}
> > +    vpdpbusd      m5, m3, [sobel_mulB] {1to16}
> > +
> > +    cvtdq2ps  m4, m4
> > +    mulps     m4, m4
> > +
> > +    cvtdq2ps    m5, m5
> > +    VFMADD231PS m4, m5, m5
> > +
> > +    sqrtps    m4, m4
> > +    fmaddps m4, m4, m0, m1
> > +    cvttps2dq m4, m4
> > +    vpmovusdb [dstq + xq], m4
> > +
> > +    add xq, mmsize/4
> > +    cmp xq, widthq
> > +    jl .loop1
> > +
> > +    add widthq, rq
> > +    cmp xq, widthq
> > +    jge .end
> > +
> > +.loop2:
> > +    xor  rd, rd
> > +    pxor m4, m4
> > +
> > +    ;Gx
> > +    SOBEL_MUL 0, data_n1
> > +    SOBEL_MUL 1, data_n2
> > +    SOBEL_MUL 2, data_n1
> > +    SOBEL_ADD 6
> > +    SOBEL_MUL 7, data_p2
> > +    SOBEL_ADD 8
> > +
> > +    cvtsi2ss xmm4, rd
> > +    mulss    xmm4, xmm4
> > +
> > +    xor rd, rd
> > +    ;Gy
> > +    SOBEL_MUL 0, data_n1
> > +    SOBEL_ADD 2
> > +    SOBEL_MUL 3, data_n2
> > +    SOBEL_MUL 5, data_p2
> > +    SOBEL_MUL 6, data_n1
> > +    SOBEL_ADD 8
> > +
> > +    cvtsi2ss  xmm5, rd
> > +    fmaddss xmm4, xmm5, xmm5, xmm4
> > +
> > +    sqrtps    xmm4, xmm4
> > +    fmaddss   xmm4, xmm4, xmm0, xmm1     ;sum = sum * rdiv + bias
> > +    cvttps2dq xmm4, xmm4     ; trunc to integer
> > +    packssdw  xmm4, xmm4
> > +    packuswb  xmm4, xmm4
> > +    movd      rd, xmm4
> > +    mov       [dstq + xq], rb
> > +
> > +    add xq, 1
> > +    cmp xq, widthq
> > +    jl .loop2
> > +.end:
> > +    RET
> > +%endmacro
> > +
> > +%if ARCH_X86_64
> > +%if HAVE_AVX512ICL_EXTERNAL
> > +INIT_ZMM avx512icl
> > +FILTER_SOBEL
> > +%endif
> > +%endif
> > diff --git a/libavfilter/x86/vf_convolution_init.c
> > b/libavfilter/x86/vf_convolution_init.c
> > index b78a47d02b..bff10ca1a4 100644
> > --- a/libavfilter/x86/vf_convolution_init.c
> > +++ b/libavfilter/x86/vf_convolution_init.c
> > @@ -29,6 +29,11 @@ void ff_filter_3x3_sse4(uint8_t *dst, int width,
> >                          const uint8_t *c[], int peak, int radius,
> >                          int dstride, int stride, int size);
> >  
> > +void ff_filter_sobel_avx512icl(uint8_t *dst, int width,
> > +                         float scale, float delta, const int *const matrix,
> > +                         const uint8_t *c[], int peak, int radius,
> > +                         int dstride, int stride, int size);
> > +
> >  av_cold void ff_convolution_init_x86(ConvolutionContext *s)
> >  {
> >  #if ARCH_X86_64
> > @@ -44,3 +49,16 @@ av_cold void ff_convolution_init_x86(ConvolutionContext
> > *s)
> >      }
> >  #endif
> >  }
> > +
> > +av_cold void ff_sobel_init_x86(ConvolutionContext *s, int depth, int
> > nb_planes)
> > +{
> > +#if ARCH_X86_64
> > +    int cpu_flags = av_get_cpu_flags();
> > +    for (int i = 0; i < nb_planes; i++) {
> > +        if (depth == 8) {
> > +            if (EXTERNAL_AVX512ICL(cpu_flags))
> > +                s->filter[i] = ff_filter_sobel_avx512icl;
> > +        }
> > +    }
> > +#endif
> > +}
> > diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
> > index 62d6616faf..a6f06c7007 100644
> > --- a/tests/checkasm/Makefile
> > +++ b/tests/checkasm/Makefile
> > @@ -46,6 +46,7 @@ AVFILTEROBJS-$(CONFIG_GBLUR_FILTER)      += vf_gblur.o
> >  AVFILTEROBJS-$(CONFIG_HFLIP_FILTER)      += vf_hflip.o
> >  AVFILTEROBJS-$(CONFIG_THRESHOLD_FILTER)  += vf_threshold.o
> >  AVFILTEROBJS-$(CONFIG_NLMEANS_FILTER)    += vf_nlmeans.o
> > +AVFILTEROBJS-$(CONFIG_SOBEL_FILTER)      += vf_convolution.o
> >  
> >  CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
> >  
> > diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
> > index 421bd096c5..3eb4780a64 100644
> > --- a/tests/checkasm/checkasm.c
> > +++ b/tests/checkasm/checkasm.c
> > @@ -197,6 +197,9 @@ static const struct {
> >      #if CONFIG_THRESHOLD_FILTER
> >          { "vf_threshold", checkasm_check_vf_threshold },
> >      #endif
> > +    #if CONFIG_SOBEL_FILTER
> > +        { "vf_sobel", checkasm_check_vf_sobel },
> > +    #endif
> >  #endif
> >  #if CONFIG_SWSCALE
> >      { "sw_gbrp", checkasm_check_sw_gbrp },
> > diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
> > index ee9151410e..214918e7ea 100644
> > --- a/tests/checkasm/checkasm.h
> > +++ b/tests/checkasm/checkasm.h
> > @@ -86,6 +86,7 @@ void checkasm_check_vf_eq(void);
> >  void checkasm_check_vf_gblur(void);
> >  void checkasm_check_vf_hflip(void);
> >  void checkasm_check_vf_threshold(void);
> > +void checkasm_check_vf_sobel(void);
> >  void checkasm_check_vp8dsp(void);
> >  void checkasm_check_vp9dsp(void);
> >  void checkasm_check_videodsp(void);
> > diff --git a/tests/checkasm/vf_convolution.c
> > b/tests/checkasm/vf_convolution.c
> > new file mode 100644
> > index 0000000000..007865863e
> > --- /dev/null
> > +++ b/tests/checkasm/vf_convolution.c
> > @@ -0,0 +1,104 @@
> > +/*
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License along
> > + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> > + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> > + */
> > +
> > +#include <string.h>
> > +#include "checkasm.h"
> > +#include "libavfilter/avfilter.h"
> > +#include "libavfilter/convolution.h"
> > +#include "libavutil/intreadwrite.h"
> > +#include "libavutil/mem_internal.h"
> > +
> > +#define WIDTH 512
> > +#define HEIGHT 512
> > +#define SRC_STRIDE 512
> > +#define PIXELS (WIDTH * HEIGHT)
> > +
> > +#define randomize_buffers(buf, size)      \
> > +    do {                                  \
> > +        int j;                            \
> > +        uint8_t *tmp_buf = (uint8_t *)buf;\
> > +        for (j = 0; j< size; j++)         \
> > +            tmp_buf[j] = rnd() & 0xFF;    \
> > +    } while (0)
> > +
> > +static void check_sobel(const char * report_name)
> > +{
> > +    LOCAL_ALIGNED_32(uint8_t, src,     [PIXELS]);
> > +    LOCAL_ALIGNED_32(uint8_t, dst_ref, [PIXELS]);
> > +    LOCAL_ALIGNED_32(uint8_t, dst_new, [PIXELS]);
> > +    const int height = WIDTH;
> > +    const int width  = HEIGHT;
> > +    const int stride = SRC_STRIDE;
> > +    const int dstride = SRC_STRIDE;
> > +    int mode = 0;
> > +    const uint8_t *c[49];
> > +    const int radius = 1;
> > +    const int bpc = 1;
> > +    const int step = mode == MATRIX_COLUMN ? 16 : 1;
> > +    const int slice_start = 0;
> > +    const int slice_end = height;
> > +    int y;
> > +    const int sizew = mode == MATRIX_COLUMN ? height : width;
> > +    float scale = 2;
> > +    float delta = 10;
> > +
> > +    ConvolutionContext s;
> > +
> > +    declare_func(void, uint8_t *dst, int width, float scale, float delta,
> > const int *const matrix,
> > +                 const uint8_t *c[], int peak, int radius, int dstride, int
> > stride, int size);
> > +
> > +    s.scale = scale;
> > +    s.delta = delta;
> > +    s.depth = 8;
> > +    s.nb_planes = 3;
> > +    s.planes = 15;
> > +    ff_sobel_init(&s, s.depth, s.nb_planes);
> > +
> > +    memset(dst_ref, 0, PIXELS);
> > +    memset(dst_new, 0, PIXELS);
> > +    randomize_buffers(src, PIXELS);
> > +
> > +    if (check_func(s.filter[0], "%s", report_name)) {
> > +        for (y = slice_start; y < slice_end; y += step) {
> > +            const int xoff = mode == MATRIX_COLUMN ? (y - slice_start) *
> > bpc
> > : radius * bpc;
> > +            const int yoff = mode == MATRIX_COLUMN ? radius * dstride : 0;
> > +
> > +            s.setup[0](radius, c, src, stride, radius, width, y, height,
> > bpc);
> > +            call_ref(dst_ref + yoff + xoff, sizew - 2 * radius,
> > +                     scale, delta, NULL, c, 0, radius,
> > +                     dstride, stride, slice_end - step);
> > +            call_new(dst_new + yoff + xoff, sizew - 2 * radius,
> > +                     scale, delta, NULL, c, 0, radius,
> > +                     dstride, stride, slice_end - step);
> > +            if (memcmp(dst_ref + yoff + xoff, dst_new + yoff + xoff,
> > slice_end - step))
> > +                fail();
> > +            bench_new(dst_new + yoff + xoff, sizew - 2 * radius,
> > +                      scale, delta, NULL, c, 0, radius,
> > +                      dstride, stride, slice_end - step);
> > +            if (mode != MATRIX_COLUMN)
> > +                dst_ref += dstride;
> > +        }
> > +    }
> > +
> > +}
> > +
> > +void checkasm_check_vf_sobel(void)
> > +{
> > +    check_sobel("sobel");
> > +    report("convolution:sobel");
> > +}
> > diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
> > index aa9b288e12..a4e95541f5 100644
> > --- a/tests/fate/checkasm.mak
> > +++ b/tests/fate/checkasm.mak
> > @@ -43,6 +43,7 @@ FATE_CHECKASM = fate-checkasm-
> > aacpsdsp                                  \
> >                  fate-checkasm-vf_hflip                                  \
> >                  fate-checkasm-vf_nlmeans                                \
> >                  fate-checkasm-vf_threshold                              \
> > +                fate-checkasm-vf_sobel                                  \
> >                  fate-checkasm-videodsp                                  \
> >                  fate-checkasm-vorbisdsp                                 \
> >                  fate-checkasm-vp8dsp                                    \
> 
> LGTM, and it works well for me: I saw a significant FPS improvement when
> running the command below.
> 
> $ ffmpeg -i 1920x1080.mp4 -vf "sobel" -f null -
> 

Does anyone else have any thoughts on this patch? I will merge it if there are
no more comments.

Thanks
Haihao
Xiang, Haihao Nov. 14, 2022, 2:12 a.m. UTC | #3
On Fri, 2022-11-11 at 03:00 +0000, Xiang, Haihao wrote:
> On Mon, 2022-11-07 at 05:24 +0000, Xiang, Haihao wrote:
> > On Fri, 2022-11-04 at 16:29 +0800, bin.wang-at-intel.com@ffmpeg.org wrote:
> > > From: bwang30 <bin.wang@intel.com>
> > > 
> > > This commit enabled assembly code with intel AVX512 VNNI and added unit
> > > test
> > > for sobel filter
> > > 
> > > sobel_c: 4537
> > > sobel_avx512icl 2136
> > > 
> > > Signed-off-by: bwang30 <bin.wang@intel.com>
> > > ---
> > >  libavfilter/convolution.h             |  74 +++++++++++++
> > >  libavfilter/vf_convolution.c          |  91 +++-------------
> > >  libavfilter/x86/vf_convolution.asm    | 147 ++++++++++++++++++++++++++
> > >  libavfilter/x86/vf_convolution_init.c |  18 ++++
> > >  tests/checkasm/Makefile               |   1 +
> > >  tests/checkasm/checkasm.c             |   3 +
> > >  tests/checkasm/checkasm.h             |   1 +
> > >  tests/checkasm/vf_convolution.c       | 104 ++++++++++++++++++
> > >  tests/fate/checkasm.mak               |   1 +
> > >  9 files changed, 362 insertions(+), 78 deletions(-)
> > >  create mode 100644 tests/checkasm/vf_convolution.c
> > > 
> > > diff --git a/libavfilter/convolution.h b/libavfilter/convolution.h
> > > index 88aabe9a20..e44bfb5da8 100644
> > > --- a/libavfilter/convolution.h
> > > +++ b/libavfilter/convolution.h
> > > @@ -21,6 +21,7 @@
> > >  #ifndef AVFILTER_CONVOLUTION_H
> > >  #define AVFILTER_CONVOLUTION_H
> > >  #include "avfilter.h"
> > > +#include "libavutil/intreadwrite.h"
> > >  
> > >  enum MatrixMode {
> > >      MATRIX_SQUARE,
> > > @@ -61,4 +62,77 @@ typedef struct ConvolutionContext {
> > >  } ConvolutionContext;
> > >  
> > >  void ff_convolution_init_x86(ConvolutionContext *s);
> > > +void ff_sobel_init_x86(ConvolutionContext *s, int depth, int nb_planes);
> > > +
> > > +static void setup_3x3(int radius, const uint8_t *c[], const uint8_t *src,
> > > int
> > > stride,
> > > +                      int x, int w, int y, int h, int bpc)
> > > +{
> > > +    int i;
> > > +
> > > +    for (i = 0; i < 9; i++) {
> > > +        int xoff = FFABS(x + ((i % 3) - 1));
> > > +        int yoff = FFABS(y + (i / 3) - 1);
> > > +
> > > +        xoff = xoff >= w ? 2 * w - 1 - xoff : xoff;
> > > +        yoff = yoff >= h ? 2 * h - 1 - yoff : yoff;
> > > +
> > > +        c[i] = src + xoff * bpc + yoff * stride;
> > > +    }
> > > +}
> > > +
> > > +static void filter_sobel(uint8_t *dst, int width,
> > > +                         float scale, float delta, const int *const
> > > matrix,
> > > +                         const uint8_t *c[], int peak, int radius,
> > > +                         int dstride, int stride, int size)
> > > +{
> > > +    const uint8_t *c0 = c[0], *c1 = c[1], *c2 = c[2];
> > > +    const uint8_t *c3 = c[3], *c5 = c[5];
> > > +    const uint8_t *c6 = c[6], *c7 = c[7], *c8 = c[8];
> > > +    int x;
> > > +
> > > +    for (x = 0; x < width; x++) {
> > > +        float suma = c0[x] * -1 + c1[x] * -2 + c2[x] * -1 +
> > > +                     c6[x] *  1 + c7[x] *  2 + c8[x] *  1;
> > > +        float sumb = c0[x] * -1 + c2[x] *  1 + c3[x] * -2 +
> > > +                     c5[x] *  2 + c6[x] * -1 + c8[x] *  1;
> > > +
> > > +        dst[x] = av_clip_uint8(sqrtf(suma*suma + sumb*sumb) * scale +
> > > delta);
> > > +    }
> > > +}
> > > +
> > > +static void filter16_sobel(uint8_t *dstp, int width,
> > > +                           float scale, float delta, const int *const
> > > matrix,
> > > +                           const uint8_t *c[], int peak, int radius,
> > > +                           int dstride, int stride, int size)
> > > +{
> > > +    uint16_t *dst = (uint16_t *)dstp;
> > > +    int x;
> > > +
> > > +    for (x = 0; x < width; x++) {
> > > +        float suma = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[1][2 * x])
> > > *
> > > -2 + AV_RN16A(&c[2][2 * x]) * -1 +
> > > +                     AV_RN16A(&c[6][2 * x]) *  1 + AV_RN16A(&c[7][2 * x])
> > > *  2 + AV_RN16A(&c[8][2 * x]) *  1;
> > > +        float sumb = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[2][2 * x])
> > > *  1 + AV_RN16A(&c[3][2 * x]) * -2 +
> > > +                     AV_RN16A(&c[5][2 * x]) *  2 + AV_RN16A(&c[6][2 * x])
> > > *
> > > -1 + AV_RN16A(&c[8][2 * x]) *  1;
> > > +
> > > +        dst[x] = av_clip(sqrtf(suma*suma + sumb*sumb) * scale + delta, 0,
> > > peak);
> > > +    }
> > > +}
> > > +
> > > +static av_unused void ff_sobel_init(ConvolutionContext *s, int depth, int
> > > nb_planes)
> > > +{
> > > +    for (int i = 0; i < 4; i++) {
> > > +        s->filter[i] = filter_sobel;
> > > +        s->copy[i] = !((1 << i) & s->planes);
> > > +        s->size[i] = 3;
> > > +        s->setup[i] = setup_3x3;
> > > +        s->rdiv[i] = s->scale;
> > > +        s->bias[i] = s->delta;
> > > +    }
> > > +    if (s->depth > 8)
> > > +        for (int i = 0; i < 4; i++)
> > > +            s->filter[i] = filter16_sobel;
> > > +#if ARCH_X86_64
> > > +    ff_sobel_init_x86(s, depth, nb_planes);
> > > +#endif
> > > +}
> > >  #endif
> > > diff --git a/libavfilter/vf_convolution.c b/libavfilter/vf_convolution.c
> > > index 9a9c099e6d..7762fa2a05 100644
> > > --- a/libavfilter/vf_convolution.c
> > > +++ b/libavfilter/vf_convolution.c
> > > @@ -139,24 +139,6 @@ static void filter16_roberts(uint8_t *dstp, int
> > > width,
> > >      }
> > >  }
> > >  
> > > -static void filter16_sobel(uint8_t *dstp, int width,
> > > -                           float scale, float delta, const int *const
> > > matrix,
> > > -                           const uint8_t *c[], int peak, int radius,
> > > -                           int dstride, int stride, int size)
> > > -{
> > > -    uint16_t *dst = (uint16_t *)dstp;
> > > -    int x;
> > > -
> > > -    for (x = 0; x < width; x++) {
> > > -        float suma = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[1][2 * x])
> > > *
> > > -2 + AV_RN16A(&c[2][2 * x]) * -1 +
> > > -                     AV_RN16A(&c[6][2 * x]) *  1 + AV_RN16A(&c[7][2 * x])
> > > *  2 + AV_RN16A(&c[8][2 * x]) *  1;
> > > -        float sumb = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[2][2 * x])
> > > *  1 + AV_RN16A(&c[3][2 * x]) * -2 +
> > > -                     AV_RN16A(&c[5][2 * x]) *  2 + AV_RN16A(&c[6][2 * x])
> > > *
> > > -1 + AV_RN16A(&c[8][2 * x]) *  1;
> > > -
> > > -        dst[x] = av_clip(sqrtf(suma*suma + sumb*sumb) * scale + delta, 0,
> > > peak);
> > > -    }
> > > -}
> > > -
> > >  static void filter16_scharr(uint8_t *dstp, int width,
> > >                              float scale, float delta, const int *const
> > > matrix,
> > >                              const uint8_t *c[], int peak, int radius,
> > > @@ -261,26 +243,6 @@ static void filter_roberts(uint8_t *dst, int width,
> > >      }
> > >  }
> > >  
> > > -static void filter_sobel(uint8_t *dst, int width,
> > > -                         float scale, float delta, const int *const
> > > matrix,
> > > -                         const uint8_t *c[], int peak, int radius,
> > > -                         int dstride, int stride, int size)
> > > -{
> > > -    const uint8_t *c0 = c[0], *c1 = c[1], *c2 = c[2];
> > > -    const uint8_t *c3 = c[3], *c5 = c[5];
> > > -    const uint8_t *c6 = c[6], *c7 = c[7], *c8 = c[8];
> > > -    int x;
> > > -
> > > -    for (x = 0; x < width; x++) {
> > > -        float suma = c0[x] * -1 + c1[x] * -2 + c2[x] * -1 +
> > > -                     c6[x] *  1 + c7[x] *  2 + c8[x] *  1;
> > > -        float sumb = c0[x] * -1 + c2[x] *  1 + c3[x] * -2 +
> > > -                     c5[x] *  2 + c6[x] * -1 + c8[x] *  1;
> > > -
> > > -        dst[x] = av_clip_uint8(sqrtf(suma*suma + sumb*sumb) * scale +
> > > delta);
> > > -    }
> > > -}
> > > -
> > >  static void filter_scharr(uint8_t *dst, int width,
> > >                            float scale, float delta, const int *const
> > > matrix,
> > >                            const uint8_t *c[], int peak, int radius,
> > > @@ -552,22 +514,6 @@ static void filter_column(uint8_t *dst, int height,
> > >      }
> > >  }
> > >  
> > > -static void setup_3x3(int radius, const uint8_t *c[], const uint8_t *src,
> > > int
> > > stride,
> > > -                      int x, int w, int y, int h, int bpc)
> > > -{
> > > -    int i;
> > > -
> > > -    for (i = 0; i < 9; i++) {
> > > -        int xoff = FFABS(x + ((i % 3) - 1));
> > > -        int yoff = FFABS(y + (i / 3) - 1);
> > > -
> > > -        xoff = xoff >= w ? 2 * w - 1 - xoff : xoff;
> > > -        yoff = yoff >= h ? 2 * h - 1 - yoff : yoff;
> > > -
> > > -        c[i] = src + xoff * bpc + yoff * stride;
> > > -    }
> > > -}
> > > -
> > >  static void setup_5x5(int radius, const uint8_t *c[], const uint8_t *src,
> > > int
> > > stride,
> > >                        int x, int w, int y, int h, int bpc)
> > >  {
> > > @@ -708,6 +654,18 @@ static int param_init(AVFilterContext *ctx)
> > >      const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
> > >      int p, i;
> > >  
> > > +    s->depth = desc->comp[0].depth;
> > > +    s->max = (1 << s->depth) - 1;
> > > +
> > > +    s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc-
> > > > log2_chroma_w);
> > > +    s->planewidth[0] = s->planewidth[3] = inlink->w;
> > > +    s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h,
> > > desc-
> > > > log2_chroma_h);
> > > +    s->planeheight[0] = s->planeheight[3] = inlink->h;
> > > +
> > > +    s->nb_planes = av_pix_fmt_count_planes(inlink->format);
> > > +    s->nb_threads = ff_filter_get_nb_threads(ctx);
> > > +    s->bpc = (s->depth + 7) / 8;
> > > +
> > >      if (!strcmp(ctx->filter->name, "convolution")) {
> > >          for (i = 0; i < 4; i++) {
> > >              int *matrix = (int *)s->matrix[i];
> > > @@ -804,14 +762,7 @@ static int param_init(AVFilterContext *ctx)
> > >              s->bias[i] = s->delta;
> > >          }
> > >      } else if (!strcmp(ctx->filter->name, "sobel")) {
> > > -        for (i = 0; i < 4; i++) {
> > > -            s->filter[i] = filter_sobel;
> > > -            s->copy[i] = !((1 << i) & s->planes);
> > > -            s->size[i] = 3;
> > > -            s->setup[i] = setup_3x3;
> > > -            s->rdiv[i] = s->scale;
> > > -            s->bias[i] = s->delta;
> > > -        }
> > > +        ff_sobel_init(s, s->depth, s->nb_planes);
> > >      } else if (!strcmp(ctx->filter->name, "kirsch")) {
> > >          for (i = 0; i < 4; i++) {
> > >              s->filter[i] = filter_kirsch;
> > > @@ -832,18 +783,6 @@ static int param_init(AVFilterContext *ctx)
> > >          }
> > >      }
> > >  
> > > -    s->depth = desc->comp[0].depth;
> > > -    s->max = (1 << s->depth) - 1;
> > > -
> > > -    s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc-
> > > > log2_chroma_w);
> > > -    s->planewidth[0] = s->planewidth[3] = inlink->w;
> > > -    s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h,
> > > desc-
> > > > log2_chroma_h);
> > > -    s->planeheight[0] = s->planeheight[3] = inlink->h;
> > > -
> > > -    s->nb_planes = av_pix_fmt_count_planes(inlink->format);
> > > -    s->nb_threads = ff_filter_get_nb_threads(ctx);
> > > -    s->bpc = (s->depth + 7) / 8;
> > > -
> > >      if (!strcmp(ctx->filter->name, "convolution")) {
> > >          if (s->depth > 8) {
> > >              for (p = 0; p < s->nb_planes; p++) {
> > > @@ -870,10 +809,6 @@ static int param_init(AVFilterContext *ctx)
> > >          if (s->depth > 8)
> > >              for (p = 0; p < s->nb_planes; p++)
> > >                  s->filter[p] = filter16_roberts;
> > > -    } else if (!strcmp(ctx->filter->name, "sobel")) {
> > > -        if (s->depth > 8)
> > > -            for (p = 0; p < s->nb_planes; p++)
> > > -                s->filter[p] = filter16_sobel;
> > >      } else if (!strcmp(ctx->filter->name, "kirsch")) {
> > >          if (s->depth > 8)
> > >              for (p = 0; p < s->nb_planes; p++)
> > > diff --git a/libavfilter/x86/vf_convolution.asm
> > > b/libavfilter/x86/vf_convolution.asm
> > > index 754d4d1064..c912d56752 100644
> > > --- a/libavfilter/x86/vf_convolution.asm
> > > +++ b/libavfilter/x86/vf_convolution.asm
> > > @@ -22,6 +22,18 @@
> > >  
> > >  SECTION_RODATA
> > >  half:   dd 0.5
> > > +data_p1: dd  1
> > > +data_n1: dd -1
> > > +data_p2: dd  2
> > > +data_n2: dd -2
> > > +
> > > +ALIGN 64
> > > +sobel_perm: db  0, 16, 32, 48,  1, 17, 33, 49,  2, 18, 34, 50,  3, 19,
> > > 35,
> > > 51
> > > +            db  4, 20, 36, 52,  5, 21, 37, 53,  6, 22, 38, 54,  7, 23,
> > > 39,
> > > 55
> > > +            db  8, 24, 40, 56,  9, 25, 41, 57, 10, 26, 42, 58, 11, 27,
> > > 43,
> > > 59
> > > +            db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31,
> > > 47,
> > > 63
> > > +sobel_mulA: db -1,  1, -2,  2
> > > +sobel_mulB: db  1, -1,  2, -2
> > >  
> > >  SECTION .text
> > >  
> > > @@ -154,3 +166,138 @@ cglobal filter_3x3, 4, 15, 7, dst, width, rdiv,
> > > bias,
> > > matrix, ptr, c0, c1, c2, c
> > >  INIT_XMM sse4
> > >  FILTER_3X3
> > >  %endif
> > > +
> > > +%macro SOBEL_MUL 2
> > > +    movzx ptrd, byte [c%1q + xq]
> > > +    imul  ptrd, [%2]
> > > +    add   rd, ptrd
> > > +%endmacro
> > > +
> > > +%macro SOBEL_ADD 1
> > > +    movzx ptrd, byte [c%1q + xq]
> > > +    add   rd, ptrd
> > > +%endmacro
> > > +
> > > +; void filter_sobel_avx512(uint8_t *dst, int width,
> > > +;                      float scale, float delta, const int *const matrix,
> > > +;                      const uint8_t *c[], int peak, int radius,
> > > +;                      int dstride, int stride)
> > > +%macro FILTER_SOBEL 0
> > > +%if UNIX64
> > > +cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3,
> > > c4,
> > > c5, c6, c7, c8, r, x
> > > +%else
> > > +cglobal filter_sobel, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0,
> > > c1,
> > > c2, c3, c4, c5, c6, c7, c8, r, x
> > > +%endif
> > > +%if WIN64
> > > +    SWAP xmm0, xmm2
> > > +    SWAP xmm1, xmm3
> > > +    mov  r2q, matrixmp
> > > +    mov  r3q, ptrmp
> > > +    DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7,
> > > c8,
> > > r, x
> > > +%endif
> > > +    movsxdifnidn widthq, widthd
> > > +    VBROADCASTSS m0, xmm0
> > > +    VBROADCASTSS m1, xmm1
> > > +    pxor  m6, m6
> > > +    mov   c0q, [ptrq + 0*gprsize]
> > > +    mov   c1q, [ptrq + 1*gprsize]
> > > +    mov   c2q, [ptrq + 2*gprsize]
> > > +    mov   c3q, [ptrq + 3*gprsize]
> > > +    mov   c4q, [ptrq + 4*gprsize]
> > > +    mov   c5q, [ptrq + 5*gprsize]
> > > +    mov   c6q, [ptrq + 6*gprsize]
> > > +    mov   c7q, [ptrq + 7*gprsize]
> > > +    mov   c8q, [ptrq + 8*gprsize]
> > > +
> > > +    xor   xq, xq
> > > +    cmp   widthq, mmsize/4
> > > +    jl .loop2
> > > +
> > > +    mov   rq, widthq
> > > +    and   rq, mmsize/4-1
> > > +    sub   widthq, rq
> > > +
> > > +    mova  m6, [sobel_perm]
> > > +.loop1:
> > > +    movu          xm3, [c2q + xq]
> > > +    pmovzxbd      m5, [c0q + xq]
> > > +    vinserti32x4  ym3, [c6q + xq], 1
> > > +    pmovzxbd      m4, [c8q + xq]
> > > +    vinserti32x4  m2, m3, [c1q + xq], 2
> > > +    vinserti32x4  m3, [c5q + xq], 2
> > > +    vinserti32x4  m2, [c7q + xq], 3
> > > +    vinserti32x4  m3, [c3q + xq], 3
> > > +    vpermb        m2, m6, m2
> > > +    psubd         m4, m5
> > > +    vpermb        m3, m6, m3
> > > +    mova          m5, m4
> > > +    vpdpbusd      m4, m2, [sobel_mulA] {1to16}
> > > +    vpdpbusd      m5, m3, [sobel_mulB] {1to16}
> > > +
> > > +    cvtdq2ps  m4, m4
> > > +    mulps     m4, m4
> > > +
> > > +    cvtdq2ps    m5, m5
> > > +    VFMADD231PS m4, m5, m5
> > > +
> > > +    sqrtps    m4, m4
> > > +    fmaddps m4, m4, m0, m1
> > > +    cvttps2dq m4, m4
> > > +    vpmovusdb [dstq + xq], m4
> > > +
> > > +    add xq, mmsize/4
> > > +    cmp xq, widthq
> > > +    jl .loop1
> > > +
> > > +    add widthq, rq
> > > +    cmp xq, widthq
> > > +    jge .end
> > > +
> > > +.loop2:
> > > +    xor  rd, rd
> > > +    pxor m4, m4
> > > +
> > > +    ;Gx
> > > +    SOBEL_MUL 0, data_n1
> > > +    SOBEL_MUL 1, data_n2
> > > +    SOBEL_MUL 2, data_n1
> > > +    SOBEL_ADD 6
> > > +    SOBEL_MUL 7, data_p2
> > > +    SOBEL_ADD 8
> > > +
> > > +    cvtsi2ss xmm4, rd
> > > +    mulss    xmm4, xmm4
> > > +
> > > +    xor rd, rd
> > > +    ;Gy
> > > +    SOBEL_MUL 0, data_n1
> > > +    SOBEL_ADD 2
> > > +    SOBEL_MUL 3, data_n2
> > > +    SOBEL_MUL 5, data_p2
> > > +    SOBEL_MUL 6, data_n1
> > > +    SOBEL_ADD 8
> > > +
> > > +    cvtsi2ss  xmm5, rd
> > > +    fmaddss xmm4, xmm5, xmm5, xmm4
> > > +
> > > +    sqrtps    xmm4, xmm4
> > > +    fmaddss   xmm4, xmm4, xmm0, xmm1     ;sum = sum * rdiv + bias
> > > +    cvttps2dq xmm4, xmm4     ; trunc to integer
> > > +    packssdw  xmm4, xmm4
> > > +    packuswb  xmm4, xmm4
> > > +    movd      rd, xmm4
> > > +    mov       [dstq + xq], rb
> > > +
> > > +    add xq, 1
> > > +    cmp xq, widthq
> > > +    jl .loop2
> > > +.end:
> > > +    RET
> > > +%endmacro
> > > +
> > > +%if ARCH_X86_64
> > > +%if HAVE_AVX512ICL_EXTERNAL
> > > +INIT_ZMM avx512icl
> > > +FILTER_SOBEL
> > > +%endif
> > > +%endif
> > > diff --git a/libavfilter/x86/vf_convolution_init.c
> > > b/libavfilter/x86/vf_convolution_init.c
> > > index b78a47d02b..bff10ca1a4 100644
> > > --- a/libavfilter/x86/vf_convolution_init.c
> > > +++ b/libavfilter/x86/vf_convolution_init.c
> > > @@ -29,6 +29,11 @@ void ff_filter_3x3_sse4(uint8_t *dst, int width,
> > >                          const uint8_t *c[], int peak, int radius,
> > >                          int dstride, int stride, int size);
> > >  
> > > +void ff_filter_sobel_avx512icl(uint8_t *dst, int width,
> > > +                         float scale, float delta, const int *const
> > > matrix,
> > > +                         const uint8_t *c[], int peak, int radius,
> > > +                         int dstride, int stride, int size);
> > > +
> > >  av_cold void ff_convolution_init_x86(ConvolutionContext *s)
> > >  {
> > >  #if ARCH_X86_64
> > > @@ -44,3 +49,16 @@ av_cold void ff_convolution_init_x86(ConvolutionContext
> > > *s)
> > >      }
> > >  #endif
> > >  }
> > > +
> > > +av_cold void ff_sobel_init_x86(ConvolutionContext *s, int depth, int
> > > nb_planes)
> > > +{
> > > +#if ARCH_X86_64
> > > +    int cpu_flags = av_get_cpu_flags();
> > > +    for (int i = 0; i < nb_planes; i++) {
> > > +        if (depth == 8) {
> > > +            if (EXTERNAL_AVX512ICL(cpu_flags))
> > > +                s->filter[i] = ff_filter_sobel_avx512icl;
> > > +        }
> > > +    }
> > > +#endif
> > > +}
> > > diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
> > > index 62d6616faf..a6f06c7007 100644
> > > --- a/tests/checkasm/Makefile
> > > +++ b/tests/checkasm/Makefile
> > > @@ -46,6 +46,7 @@ AVFILTEROBJS-$(CONFIG_GBLUR_FILTER)      += vf_gblur.o
> > >  AVFILTEROBJS-$(CONFIG_HFLIP_FILTER)      += vf_hflip.o
> > >  AVFILTEROBJS-$(CONFIG_THRESHOLD_FILTER)  += vf_threshold.o
> > >  AVFILTEROBJS-$(CONFIG_NLMEANS_FILTER)    += vf_nlmeans.o
> > > +AVFILTEROBJS-$(CONFIG_SOBEL_FILTER)      += vf_convolution.o
> > >  
> > >  CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
> > >  
> > > diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
> > > index 421bd096c5..3eb4780a64 100644
> > > --- a/tests/checkasm/checkasm.c
> > > +++ b/tests/checkasm/checkasm.c
> > > @@ -197,6 +197,9 @@ static const struct {
> > >      #if CONFIG_THRESHOLD_FILTER
> > >          { "vf_threshold", checkasm_check_vf_threshold },
> > >      #endif
> > > +    #if CONFIG_SOBEL_FILTER
> > > +        { "vf_sobel", checkasm_check_vf_sobel },
> > > +    #endif
> > >  #endif
> > >  #if CONFIG_SWSCALE
> > >      { "sw_gbrp", checkasm_check_sw_gbrp },
> > > diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
> > > index ee9151410e..214918e7ea 100644
> > > --- a/tests/checkasm/checkasm.h
> > > +++ b/tests/checkasm/checkasm.h
> > > @@ -86,6 +86,7 @@ void checkasm_check_vf_eq(void);
> > >  void checkasm_check_vf_gblur(void);
> > >  void checkasm_check_vf_hflip(void);
> > >  void checkasm_check_vf_threshold(void);
> > > +void checkasm_check_vf_sobel(void);
> > >  void checkasm_check_vp8dsp(void);
> > >  void checkasm_check_vp9dsp(void);
> > >  void checkasm_check_videodsp(void);
> > > diff --git a/tests/checkasm/vf_convolution.c
> > > b/tests/checkasm/vf_convolution.c
> > > new file mode 100644
> > > index 0000000000..007865863e
> > > --- /dev/null
> > > +++ b/tests/checkasm/vf_convolution.c
> > > @@ -0,0 +1,104 @@
> > > +/*
> > > + * This file is part of FFmpeg.
> > > + *
> > > + * FFmpeg is free software; you can redistribute it and/or modify
> > > + * it under the terms of the GNU General Public License as published by
> > > + * the Free Software Foundation; either version 2 of the License, or
> > > + * (at your option) any later version.
> > > + *
> > > + * FFmpeg is distributed in the hope that it will be useful,
> > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > > + * GNU General Public License for more details.
> > > + *
> > > + * You should have received a copy of the GNU General Public License
> > > along
> > > + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> > > + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> > > + */
> > > +
> > > +#include <string.h>
> > > +#include "checkasm.h"
> > > +#include "libavfilter/avfilter.h"
> > > +#include "libavfilter/convolution.h"
> > > +#include "libavutil/intreadwrite.h"
> > > +#include "libavutil/mem_internal.h"
> > > +
> > > +#define WIDTH 512
> > > +#define HEIGHT 512
> > > +#define SRC_STRIDE 512
> > > +#define PIXELS (WIDTH * HEIGHT)
> > > +
> > > +#define randomize_buffers(buf, size)      \
> > > +    do {                                  \
> > > +        int j;                            \
> > > +        uint8_t *tmp_buf = (uint8_t *)buf;\
> > > +        for (j = 0; j< size; j++)         \
> > > +            tmp_buf[j] = rnd() & 0xFF;    \
> > > +    } while (0)
> > > +
> > > +static void check_sobel(const char * report_name)
> > > +{
> > > +    LOCAL_ALIGNED_32(uint8_t, src,     [PIXELS]);
> > > +    LOCAL_ALIGNED_32(uint8_t, dst_ref, [PIXELS]);
> > > +    LOCAL_ALIGNED_32(uint8_t, dst_new, [PIXELS]);
> > > +    const int height = WIDTH;
> > > +    const int width  = HEIGHT;
> > > +    const int stride = SRC_STRIDE;
> > > +    const int dstride = SRC_STRIDE;
> > > +    int mode = 0;
> > > +    const uint8_t *c[49];
> > > +    const int radius = 1;
> > > +    const int bpc = 1;
> > > +    const int step = mode == MATRIX_COLUMN ? 16 : 1;
> > > +    const int slice_start = 0;
> > > +    const int slice_end = height;
> > > +    int y;
> > > +    const int sizew = mode == MATRIX_COLUMN ? height : width;
> > > +    float scale = 2;
> > > +    float delta = 10;
> > > +
> > > +    ConvolutionContext s;
> > > +
> > > +    declare_func(void, uint8_t *dst, int width, float scale, float delta,
> > > const int *const matrix,
> > > +                 const uint8_t *c[], int peak, int radius, int dstride,
> > > int
> > > stride, int size);
> > > +
> > > +    s.scale = scale;
> > > +    s.delta = delta;
> > > +    s.depth = 8;
> > > +    s.nb_planes = 3;
> > > +    s.planes = 15;
> > > +    ff_sobel_init(&s, s.depth, s.nb_planes);
> > > +
> > > +    memset(dst_ref, 0, PIXELS);
> > > +    memset(dst_new, 0, PIXELS);
> > > +    randomize_buffers(src, PIXELS);
> > > +
> > > +    if (check_func(s.filter[0], "%s", report_name)) {
> > > +        for (y = slice_start; y < slice_end; y += step) {
> > > +            const int xoff = mode == MATRIX_COLUMN ? (y - slice_start) *
> > > bpc
> > > : radius * bpc;
> > > +            const int yoff = mode == MATRIX_COLUMN ? radius * dstride :
> > > 0;
> > > +
> > > +            s.setup[0](radius, c, src, stride, radius, width, y, height,
> > > bpc);
> > > +            call_ref(dst_ref + yoff + xoff, sizew - 2 * radius,
> > > +                     scale, delta, NULL, c, 0, radius,
> > > +                     dstride, stride, slice_end - step);
> > > +            call_new(dst_new + yoff + xoff, sizew - 2 * radius,
> > > +                     scale, delta, NULL, c, 0, radius,
> > > +                     dstride, stride, slice_end - step);
> > > +            if (memcmp(dst_ref + yoff + xoff, dst_new + yoff + xoff,
> > > slice_end - step))
> > > +                fail();
> > > +            bench_new(dst_new + yoff + xoff, sizew - 2 * radius,
> > > +                      scale, delta, NULL, c, 0, radius,
> > > +                      dstride, stride, slice_end - step);
> > > +            if (mode != MATRIX_COLUMN)
> > > +                dst_ref += dstride;
> > > +        }
> > > +    }
> > > +
> > > +}
> > > +
> > > +void checkasm_check_vf_sobel(void)
> > > +{
> > > +    check_sobel("sobel");
> > > +    report("convolution:sobel");
> > > +}
> > > diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
> > > index aa9b288e12..a4e95541f5 100644
> > > --- a/tests/fate/checkasm.mak
> > > +++ b/tests/fate/checkasm.mak
> > > @@ -43,6 +43,7 @@ FATE_CHECKASM = fate-checkasm-
> > > aacpsdsp                                  \
> > >                  fate-checkasm-vf_hflip                                  \
> > >                  fate-checkasm-vf_nlmeans                                \
> > >                  fate-checkasm-vf_threshold                              \
> > > +                fate-checkasm-vf_sobel                                  \
> > >                  fate-checkasm-videodsp                                  \
> > >                  fate-checkasm-vorbisdsp                                 \
> > >                  fate-checkasm-vp8dsp                                    \
> > 
> > LGTM and it works well for me, I saw a significant FPS improvement when
> > running
> > the command below. 
> > 
> > $ ffmpeg -i 1920x1080.mp4 -vf "sobel" -f null -
> > 
> 
> Does anyone else have any thoughts on this patch? I will merge it if there are
> no
> more comments. 

Pushed, 

-Haihao
James Almer Nov. 14, 2022, 2:42 a.m. UTC | #4
On 11/4/2022 5:29 AM, bin.wang-at-intel.com@ffmpeg.org wrote:
> +%macro FILTER_SOBEL 0
> +%if UNIX64
> +cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
> +%else
> +cglobal filter_sobel, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
> +%endif
> +%if WIN64
> +    SWAP xmm0, xmm2
> +    SWAP xmm1, xmm3
> +    mov  r2q, matrixmp
> +    mov  r3q, ptrmp
> +    DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
> +%endif
> +    movsxdifnidn widthq, widthd
> +    VBROADCASTSS m0, xmm0
> +    VBROADCASTSS m1, xmm1

This and every other xmm# case should instead be xm#, to ensure the 
swapping is taken into account.
Wang, Bin Nov. 14, 2022, 5:58 a.m. UTC | #5
-----Original Message-----
From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of James Almer
Sent: Monday, November 14, 2022 10:43 AM
To: ffmpeg-devel@ffmpeg.org
Subject: Re: [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI

On 11/4/2022 5:29 AM, bin.wang-at-intel.com@ffmpeg.org wrote:
> +%macro FILTER_SOBEL 0
> +%if UNIX64
> +cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, 
> +c3, c4, c5, c6, c7, c8, r, x %else cglobal filter_sobel, 4, 15, 7, 
> +dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, 
> +c8, r, x %endif %if WIN64
> +    SWAP xmm0, xmm2
> +    SWAP xmm1, xmm3
> +    mov  r2q, matrixmp
> +    mov  r3q, ptrmp
> +    DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, 
> +c7, c8, r, x %endif
> +    movsxdifnidn widthq, widthd
> +    VBROADCASTSS m0, xmm0
> +    VBROADCASTSS m1, xmm1

> + This and every other xmm# case should instead be xm#, to ensure the swapping is taken into account.

Sorry, I don't get your point. Could you please explain why I have to use xm# to ensure the swapping operation is taken into account (swap xmm# can't work in WIN64 asm)? And how do I do it?
James Almer Nov. 14, 2022, 11:34 a.m. UTC | #6
On 11/14/2022 2:58 AM, Wang, Bin wrote:
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of James Almer
> Sent: Monday, November 14, 2022 10:43 AM
> To: ffmpeg-devel@ffmpeg.org
> Subject: Re: [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI
> 
> On 11/4/2022 5:29 AM, bin.wang-at-intel.com@ffmpeg.org wrote:
>> +%macro FILTER_SOBEL 0
>> +%if UNIX64
>> +cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2,
>> +c3, c4, c5, c6, c7, c8, r, x %else cglobal filter_sobel, 4, 15, 7,
>> +dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7,
>> +c8, r, x %endif %if WIN64
>> +    SWAP xmm0, xmm2
>> +    SWAP xmm1, xmm3
>> +    mov  r2q, matrixmp
>> +    mov  r3q, ptrmp
>> +    DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6,
>> +c7, c8, r, x %endif
>> +    movsxdifnidn widthq, widthd
>> +    VBROADCASTSS m0, xmm0
>> +    VBROADCASTSS m1, xmm1
> 
>> + This and every other xmm# case should instead be xm#, to ensure the swapping is taken into account.
> 
> Sorry, I can't get your point, could you please help to explain why I have to use xm# to ensure the swapping operation(swap xmm# can't work in WIN64 asm)? And How to do it ?

SWAP only affects the x86inc defined macros m#, xm#, ym#, and zm#, so 
those instructions above end up encoded as vbroadcastss zmm2, xmm0 and
vbroadcastss zmm3, xmm1 on WIN64.
In fact, now that I check it, they end up as vbroadcastss zmm18, xmm0 and 
vbroadcastss zmm19, xmm1 because x86inc is purposely using the higher 16 
regs with these macros on all targets to avoid having to call vzeroupper 
at the end. This works on unix64 by pure chance because the floats were 
effectively in xmm0 and xmm1 and all calculations then happen on m#, xm# 
and ym#.

So you'll have to duplicate the VBROADCASTSS lines to broadcast xmm2 and 
xmm3 to m0 and m1 on WIN64 instead of using SWAP.
James Almer Nov. 14, 2022, 12:54 p.m. UTC | #7
On 11/4/2022 5:29 AM, bin.wang-at-intel.com@ffmpeg.org wrote:
> +.loop2:
> +    xor  rd, rd
> +    pxor m4, m4
> +
> +    ;Gx
> +    SOBEL_MUL 0, data_n1
> +    SOBEL_MUL 1, data_n2
> +    SOBEL_MUL 2, data_n1
> +    SOBEL_ADD 6
> +    SOBEL_MUL 7, data_p2
> +    SOBEL_ADD 8
> +
> +    cvtsi2ss xmm4, rd
> +    mulss    xmm4, xmm4
> +
> +    xor rd, rd
> +    ;Gy
> +    SOBEL_MUL 0, data_n1
> +    SOBEL_ADD 2
> +    SOBEL_MUL 3, data_n2
> +    SOBEL_MUL 5, data_p2
> +    SOBEL_MUL 6, data_n1
> +    SOBEL_ADD 8
> +
> +    cvtsi2ss  xmm5, rd
> +    fmaddss xmm4, xmm5, xmm5, xmm4
> +
> +    sqrtps    xmm4, xmm4
> +    fmaddss   xmm4, xmm4, xmm0, xmm1     ;sum = sum * rdiv + bias

By using xmm# you're not taking into account any x86inc SWAPing, so this 
is using xmm0 and xmm1 where the single scalar float input arguments 
reside (at least on unix64), instead of xm0 and xm1 (xmm16 and xmm17) 
where the broadcasted scalars were stored.
This, again, only worked by chance on unix64 because you're using scalar 
fmadd, and shouldn't work at all on win64.

Also, all of these, as is, are being encoded as VEX, not EVEX, but it should 
be fine leaving them untouched instead of using xm#, since they will be 
shorter (five bytes instead of six for some) by using the lower, non 
callee-saved regs.

> +    cvttps2dq xmm4, xmm4     ; trunc to integer
> +    packssdw  xmm4, xmm4
> +    packuswb  xmm4, xmm4
> +    movd      rd, xmm4
> +    mov       [dstq + xq], rb
> +
> +    add xq, 1
> +    cmp xq, widthq
> +    jl .loop2
> +.end:
> +    RET
Wang, Bin Nov. 14, 2022, 1:30 p.m. UTC | #8
> By using xmm# you're not taking into account any x86inc SWAPing, so this is
> using xmm0 and xmm1 where the single scalar float input arguments reside (at
> least on unix64), instead of xm0 and xm1 (xmm16 and xmm17) where the
> broadcasted scalars were stored.
> This, again, only worked by chance on unix64 because you're using scalar fmadd,
> and shouldn't work at all on win64.
> 
> Also, all these as is are being encoded as VEX, not EVEX, but it should be fine
> leaving them untouched instead of using xm#, since they will be shorter (five
> bytes instead of six for some) by using the lower, non callee-saved regs.

Thanks for the help. I'm not familiar with WIN64 asm. So what I need to do is change the WIN64 swap from:
SWAP xmm0, xmm2
SWAP xmm1, xmm3
To:
VBROADCASTSS m0, xmm2
VBROADCASTSS m1, xmm3

Is that correct?
James Almer Nov. 14, 2022, 1:35 p.m. UTC | #9
On 11/14/2022 10:30 AM, Wang, Bin wrote:
>> By using xmm# you're not taking into account any x86inc SWAPing, so this is
>> using xmm0 and xmm1 where the single scalar float input arguments reside (at
>> least on unix64), instead of xm0 and xm1 (xmm16 and xmm17) where the
>> broadcasted scalars were stored.
>> This, again, only worked by chance on unix64 because you're using scalar fmadd,
>> and shouldn't work at all on win64.
>>
>> Also, all these as is are being encoded as VEX, not EVEX, but it should be fine
>> leaving them untouched instead of using xm#, since they will be shorter (five
>> bytes instead of six for some) by using the lower, non callee-saved regs.
> 
> Thanks for the help. I'm not familiar with WIN64 asm. So what I need to do is change the WIN64 swap from:
> SWAP xmm0, xmm2
> SWAP xmm1, xmm3
> To:
> VBROADCASTSS m0, xmm2
> VBROADCASTSS m1, xmm3
> 
> Is that correct?

Yes, that will ultimately broadcast the two scalars in xmm2 and xmm3 to 
zmm16 and zmm17.
After that what you need to do is either change the fmaddss instruction 
to use xm0 and xm1 macros instead of xmm0 and xmm1 (so xmm16 and xmm17 
with EVEX encoding is used), or much like the broadcast above use xmm2 
and xmm3 explicitly on win64, so it remains VEX encoded.
Wang, Bin Nov. 14, 2022, 1:54 p.m. UTC | #10
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of James
> Almer
> Sent: Monday, November 14, 2022 9:36 PM
> To: ffmpeg-devel@ffmpeg.org
> Subject: Re: [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add
> sobel filter optimization and unit test with intel AVX512 VNNI
> 
> On 11/14/2022 10:30 AM, Wang, Bin wrote:
> >> By using xmm# you're not taking into account any x86inc SWAPing, so
> >> this is using xmm0 and xmm1 where the single scalar float input
> >> arguments reside (at least on unix64), instead of xm0 and xm1 (xmm16
> >> and xmm17) where the broadcasted scalars were stored.
> >> This, again, only worked by chance on unix64 because you're using
> >> scalar fmadd, and shouldn't work at all on win64.
> >>
> >> Also, all these as is are being encoded as VEX, not EVEX, but it
> >> should be fine leaving them untouched instead of using xm#, since
> >> they will be shorter (five bytes instead of six for some) by using the lower,
> non callee-saved regs.
> >
> > Thanks for the help. I'm not familiar with WIN64 asm. So what I need to do is
> change the WIN64 swap from:
> > SWAP xmm0, xmm2
> > SWAP xmm1, xmm3
> > To:
> > VBROADCASTSS m0, xmm2
> > VBROADCASTSS m1, xmm3
> >
> > Is that correct?
> 
> Yes, that will ultimately broadcast the two scalars in xmm2 and xmm3 to
> zmm16 and zmm17.
> After that what you need to do is either change the fmaddss instruction to use
> xm0 and xm1 macros instead of xmm0 and xmm1 (so xmm16 and xmm17 with
> EVEX encoding is used), or much like the broadcast above use xmm2 and xmm3
> explicitly on win64, so it remains VEX encoded.

So, to fix the issue, do these 2 changes look good to you?
First, change the WIN64 swap from:
SWAP xmm0, xmm2
SWAP xmm1, xmm3
To:
VBROADCASTSS m0, xmm2
VBROADCASTSS m1, xmm3

Second, change the fmaddss from:
fmaddss   xmm4, xmm4, xmm0, xmm1
To:
fmaddss   xmm4, xmm4, xm0, xm1


> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org
> with subject "unsubscribe".
James Almer Nov. 14, 2022, 2:31 p.m. UTC | #11
On 11/14/2022 10:54 AM, Wang, Bin wrote:
> 
> 
>> -----Original Message-----
>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of James
>> Almer
>> Sent: Monday, November 14, 2022 9:36 PM
>> To: ffmpeg-devel@ffmpeg.org
>> Subject: Re: [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add
>> sobel filter optimization and unit test with intel AVX512 VNNI
>>
>> On 11/14/2022 10:30 AM, Wang, Bin wrote:
>>>> By using xmm# you're not taking into account any x86inc SWAPing, so
>>>> this is using xmm0 and xmm1 where the single scalar float input
>>>> arguments reside (at least on unix64), instead of xm0 and xm1 (xmm16
>>>> and xmm17) where the broadcasted scalars were stored.
>>>> This, again, only worked by chance on unix64 because you're using
>>>> scalar fmadd, and shouldn't work at all on win64.
>>>>
>>>> Also, all these as is are being encoded as VEX, not EVEX, but it
>>>> should be fine leaving them untouched instead of using xm#, since
>>>> they will be shorter (five bytes instead of six for some) by using the lower,
>> non callee-saved regs.
>>>
>>> Thanks for the help. I'm not familiar with WIN64 asm. So what I need to do is
>> change the WIN64 swap from:
>>> SWAP xmm0, xmm2
>>> SWAP xmm1, xmm3
>>> To:
>>> VBROADCASTSS m0, xmm2
>>> VBROADCASTSS m1, xmm3
>>>
>>> Is that correct?
>>
>> Yes, that will ultimately broadcast the two scalars in xmm2 and xmm3 to
>> zmm16 and zmm17.
>> After that what you need to do is either change the fmaddss instruction to use
>> xm0 and xm1 macros instead of xmm0 and xmm1 (so xmm16 and xmm17 with
>> EVEX encoding is used), or much like the broadcast above use xmm2 and xmm3
>> explicitly on win64, so it remains VEX encoded.
> 
> So, to fix the issue, does this 2 changes looks good for you?
> First change the WIN64 swap from:
> SWAP xmm0, xmm2
> SWAP xmm1, xmm3
> To:
> VBROADCASTSS m0, xmm2
> VBROADCASTSS m1, xmm3
> 
> Second change the fmaddss from:
> fmaddss   xmm4, xmm4, xmm0, xmm1
> To:
> fmaddss   xmm4, xmm4, xm0, xm1

Yes.
Wang, Bin Nov. 14, 2022, 3:18 p.m. UTC | #12
> >> On 11/14/2022 10:30 AM, Wang, Bin wrote:
> >>>> By using xmm# you're not taking into account any x86inc SWAPing, so
> >>>> this is using xmm0 and xmm1 where the single scalar float input
> >>>> arguments reside (at least on unix64), instead of xm0 and xm1
> >>>> (xmm16 and xmm17) where the broadcasted scalars were stored.
> >>>> This, again, only worked by chance on unix64 because you're using
> >>>> scalar fmadd, and shouldn't work at all on win64.
> >>>>
> >>>> Also, all these as is are being encoded as VEX, not EVEX, but it
> >>>> should be fine leaving them untouched instead of using xm#, since
> >>>> they will be shorter (five bytes instead of six for some) by using
> >>>> the lower,
> >> non callee-saved regs.
> >>>
> >>> Thanks for the help. I'm not familiar with WIN64 asm. So what I need
> >>> to do is
> >> change the WIN64 swap from:
> >>> SWAP xmm0, xmm2
> >>> SWAP xmm1, xmm3
> >>> To:
> >>> VBROADCASTSS m0, xmm2
> >>> VBROADCASTSS m1, xmm3
> >>>
> >>> Is that correct?
> >>
> >> Yes, that will ultimately broadcast the two scalars in xmm2 and xmm3
> >> to
> >> zmm16 and zmm17.
> >> After that what you need to do is either change the fmaddss
> >> instruction to use
> >> xm0 and xm1 macros instead of xmm0 and xmm1 (so xmm16 and xmm17
> with
> >> EVEX encoding is used), or much like the broadcast above use xmm2 and
> >> xmm3 explicitly on win64, so it remains VEX encoded.
> >
> > So, to fix the issue, does this 2 changes looks good for you?
> > First change the WIN64 swap from:
> > SWAP xmm0, xmm2
> > SWAP xmm1, xmm3
> > To:
> > VBROADCASTSS m0, xmm2
> > VBROADCASTSS m1, xmm3
> >
> > Second change the fmaddss from:
> > fmaddss   xmm4, xmm4, xmm0, xmm1
> > To:
> > fmaddss   xmm4, xmm4, xm0, xm1
> 
> Yes.

I appreciate your help; I have sent a new patch here:
https://patchwork.ffmpeg.org/project/ffmpeg/patch/20221114143551.9740-1-bin.wang@intel.com/

> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org
> with subject "unsubscribe".
diff mbox series

Patch

diff --git a/libavfilter/convolution.h b/libavfilter/convolution.h
index 88aabe9a20..e44bfb5da8 100644
--- a/libavfilter/convolution.h
+++ b/libavfilter/convolution.h
@@ -21,6 +21,7 @@ 
 #ifndef AVFILTER_CONVOLUTION_H
 #define AVFILTER_CONVOLUTION_H
 #include "avfilter.h"
+#include "libavutil/intreadwrite.h"
 
 enum MatrixMode {
     MATRIX_SQUARE,
@@ -61,4 +62,77 @@  typedef struct ConvolutionContext {
 } ConvolutionContext;
 
 void ff_convolution_init_x86(ConvolutionContext *s);
+void ff_sobel_init_x86(ConvolutionContext *s, int depth, int nb_planes);
+
+static void setup_3x3(int radius, const uint8_t *c[], const uint8_t *src, int stride,
+                      int x, int w, int y, int h, int bpc)
+{
+    int i;
+
+    for (i = 0; i < 9; i++) {
+        int xoff = FFABS(x + ((i % 3) - 1));
+        int yoff = FFABS(y + (i / 3) - 1);
+
+        xoff = xoff >= w ? 2 * w - 1 - xoff : xoff;
+        yoff = yoff >= h ? 2 * h - 1 - yoff : yoff;
+
+        c[i] = src + xoff * bpc + yoff * stride;
+    }
+}
+
+static void filter_sobel(uint8_t *dst, int width,
+                         float scale, float delta, const int *const matrix,
+                         const uint8_t *c[], int peak, int radius,
+                         int dstride, int stride, int size)
+{
+    const uint8_t *c0 = c[0], *c1 = c[1], *c2 = c[2];
+    const uint8_t *c3 = c[3], *c5 = c[5];
+    const uint8_t *c6 = c[6], *c7 = c[7], *c8 = c[8];
+    int x;
+
+    for (x = 0; x < width; x++) {
+        float suma = c0[x] * -1 + c1[x] * -2 + c2[x] * -1 +
+                     c6[x] *  1 + c7[x] *  2 + c8[x] *  1;
+        float sumb = c0[x] * -1 + c2[x] *  1 + c3[x] * -2 +
+                     c5[x] *  2 + c6[x] * -1 + c8[x] *  1;
+
+        dst[x] = av_clip_uint8(sqrtf(suma*suma + sumb*sumb) * scale + delta);
+    }
+}
+
+static void filter16_sobel(uint8_t *dstp, int width,
+                           float scale, float delta, const int *const matrix,
+                           const uint8_t *c[], int peak, int radius,
+                           int dstride, int stride, int size)
+{
+    uint16_t *dst = (uint16_t *)dstp;
+    int x;
+
+    for (x = 0; x < width; x++) {
+        float suma = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[1][2 * x]) * -2 + AV_RN16A(&c[2][2 * x]) * -1 +
+                     AV_RN16A(&c[6][2 * x]) *  1 + AV_RN16A(&c[7][2 * x]) *  2 + AV_RN16A(&c[8][2 * x]) *  1;
+        float sumb = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[2][2 * x]) *  1 + AV_RN16A(&c[3][2 * x]) * -2 +
+                     AV_RN16A(&c[5][2 * x]) *  2 + AV_RN16A(&c[6][2 * x]) * -1 + AV_RN16A(&c[8][2 * x]) *  1;
+
+        dst[x] = av_clip(sqrtf(suma*suma + sumb*sumb) * scale + delta, 0, peak);
+    }
+}
+
+static av_unused void ff_sobel_init(ConvolutionContext *s, int depth, int nb_planes)
+{
+    for (int i = 0; i < 4; i++) {
+        s->filter[i] = filter_sobel;
+        s->copy[i] = !((1 << i) & s->planes);
+        s->size[i] = 3;
+        s->setup[i] = setup_3x3;
+        s->rdiv[i] = s->scale;
+        s->bias[i] = s->delta;
+    }
+    if (s->depth > 8)
+        for (int i = 0; i < 4; i++)
+            s->filter[i] = filter16_sobel;
+#if ARCH_X86_64
+    ff_sobel_init_x86(s, depth, nb_planes);
+#endif
+}
 #endif
diff --git a/libavfilter/vf_convolution.c b/libavfilter/vf_convolution.c
index 9a9c099e6d..7762fa2a05 100644
--- a/libavfilter/vf_convolution.c
+++ b/libavfilter/vf_convolution.c
@@ -139,24 +139,6 @@  static void filter16_roberts(uint8_t *dstp, int width,
     }
 }
 
-static void filter16_sobel(uint8_t *dstp, int width,
-                           float scale, float delta, const int *const matrix,
-                           const uint8_t *c[], int peak, int radius,
-                           int dstride, int stride, int size)
-{
-    uint16_t *dst = (uint16_t *)dstp;
-    int x;
-
-    for (x = 0; x < width; x++) {
-        float suma = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[1][2 * x]) * -2 + AV_RN16A(&c[2][2 * x]) * -1 +
-                     AV_RN16A(&c[6][2 * x]) *  1 + AV_RN16A(&c[7][2 * x]) *  2 + AV_RN16A(&c[8][2 * x]) *  1;
-        float sumb = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[2][2 * x]) *  1 + AV_RN16A(&c[3][2 * x]) * -2 +
-                     AV_RN16A(&c[5][2 * x]) *  2 + AV_RN16A(&c[6][2 * x]) * -1 + AV_RN16A(&c[8][2 * x]) *  1;
-
-        dst[x] = av_clip(sqrtf(suma*suma + sumb*sumb) * scale + delta, 0, peak);
-    }
-}
-
 static void filter16_scharr(uint8_t *dstp, int width,
                             float scale, float delta, const int *const matrix,
                             const uint8_t *c[], int peak, int radius,
@@ -261,26 +243,6 @@  static void filter_roberts(uint8_t *dst, int width,
     }
 }
 
-static void filter_sobel(uint8_t *dst, int width,
-                         float scale, float delta, const int *const matrix,
-                         const uint8_t *c[], int peak, int radius,
-                         int dstride, int stride, int size)
-{
-    const uint8_t *c0 = c[0], *c1 = c[1], *c2 = c[2];
-    const uint8_t *c3 = c[3], *c5 = c[5];
-    const uint8_t *c6 = c[6], *c7 = c[7], *c8 = c[8];
-    int x;
-
-    for (x = 0; x < width; x++) {
-        float suma = c0[x] * -1 + c1[x] * -2 + c2[x] * -1 +
-                     c6[x] *  1 + c7[x] *  2 + c8[x] *  1;
-        float sumb = c0[x] * -1 + c2[x] *  1 + c3[x] * -2 +
-                     c5[x] *  2 + c6[x] * -1 + c8[x] *  1;
-
-        dst[x] = av_clip_uint8(sqrtf(suma*suma + sumb*sumb) * scale + delta);
-    }
-}
-
 static void filter_scharr(uint8_t *dst, int width,
                           float scale, float delta, const int *const matrix,
                           const uint8_t *c[], int peak, int radius,
@@ -552,22 +514,6 @@  static void filter_column(uint8_t *dst, int height,
     }
 }
 
-static void setup_3x3(int radius, const uint8_t *c[], const uint8_t *src, int stride,
-                      int x, int w, int y, int h, int bpc)
-{
-    int i;
-
-    for (i = 0; i < 9; i++) {
-        int xoff = FFABS(x + ((i % 3) - 1));
-        int yoff = FFABS(y + (i / 3) - 1);
-
-        xoff = xoff >= w ? 2 * w - 1 - xoff : xoff;
-        yoff = yoff >= h ? 2 * h - 1 - yoff : yoff;
-
-        c[i] = src + xoff * bpc + yoff * stride;
-    }
-}
-
 static void setup_5x5(int radius, const uint8_t *c[], const uint8_t *src, int stride,
                       int x, int w, int y, int h, int bpc)
 {
@@ -708,6 +654,18 @@  static int param_init(AVFilterContext *ctx)
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
     int p, i;
 
+    s->depth = desc->comp[0].depth;
+    s->max = (1 << s->depth) - 1;
+
+    s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);
+    s->planewidth[0] = s->planewidth[3] = inlink->w;
+    s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
+    s->planeheight[0] = s->planeheight[3] = inlink->h;
+
+    s->nb_planes = av_pix_fmt_count_planes(inlink->format);
+    s->nb_threads = ff_filter_get_nb_threads(ctx);
+    s->bpc = (s->depth + 7) / 8;
+
     if (!strcmp(ctx->filter->name, "convolution")) {
         for (i = 0; i < 4; i++) {
             int *matrix = (int *)s->matrix[i];
@@ -804,14 +762,7 @@  static int param_init(AVFilterContext *ctx)
             s->bias[i] = s->delta;
         }
     } else if (!strcmp(ctx->filter->name, "sobel")) {
-        for (i = 0; i < 4; i++) {
-            s->filter[i] = filter_sobel;
-            s->copy[i] = !((1 << i) & s->planes);
-            s->size[i] = 3;
-            s->setup[i] = setup_3x3;
-            s->rdiv[i] = s->scale;
-            s->bias[i] = s->delta;
-        }
+        ff_sobel_init(s, s->depth, s->nb_planes);
     } else if (!strcmp(ctx->filter->name, "kirsch")) {
         for (i = 0; i < 4; i++) {
             s->filter[i] = filter_kirsch;
@@ -832,18 +783,6 @@  static int param_init(AVFilterContext *ctx)
         }
     }
 
-    s->depth = desc->comp[0].depth;
-    s->max = (1 << s->depth) - 1;
-
-    s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);
-    s->planewidth[0] = s->planewidth[3] = inlink->w;
-    s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
-    s->planeheight[0] = s->planeheight[3] = inlink->h;
-
-    s->nb_planes = av_pix_fmt_count_planes(inlink->format);
-    s->nb_threads = ff_filter_get_nb_threads(ctx);
-    s->bpc = (s->depth + 7) / 8;
-
     if (!strcmp(ctx->filter->name, "convolution")) {
         if (s->depth > 8) {
             for (p = 0; p < s->nb_planes; p++) {
@@ -870,10 +809,6 @@  static int param_init(AVFilterContext *ctx)
         if (s->depth > 8)
             for (p = 0; p < s->nb_planes; p++)
                 s->filter[p] = filter16_roberts;
-    } else if (!strcmp(ctx->filter->name, "sobel")) {
-        if (s->depth > 8)
-            for (p = 0; p < s->nb_planes; p++)
-                s->filter[p] = filter16_sobel;
     } else if (!strcmp(ctx->filter->name, "kirsch")) {
         if (s->depth > 8)
             for (p = 0; p < s->nb_planes; p++)
diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm
index 754d4d1064..c912d56752 100644
--- a/libavfilter/x86/vf_convolution.asm
+++ b/libavfilter/x86/vf_convolution.asm
@@ -22,6 +22,18 @@ 
 
 SECTION_RODATA
 half:   dd 0.5
+data_p1: dd  1    ; dword weight +1 (the scalar loop in this patch uses SOBEL_ADD for +1 taps, not this label)
+data_n1: dd -1    ; dword weight -1, used as the imul memory operand via SOBEL_MUL
+data_p2: dd  2    ; dword weight +2, used as the imul memory operand via SOBEL_MUL
+data_n2: dd -2    ; dword weight -2, used as the imul memory operand via SOBEL_MUL
+
+ALIGN 64
+sobel_perm: db  0, 16, 32, 48,  1, 17, 33, 49,  2, 18, 34, 50,  3, 19, 35, 51    ; vpermb byte indices: output dword i gathers byte i of 128-bit lanes 0, 1, 2, 3
+            db  4, 20, 36, 52,  5, 21, 37, 53,  6, 22, 38, 54,  7, 23, 39, 55
+            db  8, 24, 40, 56,  9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59
+            db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63
+sobel_mulA: db -1,  1, -2,  2    ; four signed byte weights, broadcast to every dword as the vpdpbusd multiplier of the first sum
+sobel_mulB: db  1, -1,  2, -2    ; four signed byte weights, broadcast to every dword as the vpdpbusd multiplier of the second sum
 
 SECTION .text
 
@@ -154,3 +166,138 @@  cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c
 INIT_XMM sse4
 FILTER_3X3
 %endif
+
+%macro SOBEL_MUL 2
+    movzx ptrd, byte [c%1q + xq]    ; scratch = zero-extended pixel at column x of the selected row pointer (first macro argument)
+    imul  ptrd, [%2]                ; multiply by the dword weight stored at the label given as second macro argument
+    add   rd, ptrd                  ; accumulate the weighted pixel into the running sum in rd
+%endmacro
+
+%macro SOBEL_ADD 1
+    movzx ptrd, byte [c%1q + xq]    ; scratch = zero-extended pixel at column x of the selected row pointer (macro argument)
+    add   rd, ptrd                  ; weight +1: add the pixel to the running sum in rd without a multiply
+%endmacro
+
+; void ff_filter_sobel_avx512icl(uint8_t *dst, int width,
+;                      float scale, float delta, const int *const matrix,
+;                      const uint8_t *c[], int peak, int radius,
+;                      int dstride, int stride, int size)
+%macro FILTER_SOBEL 0
+%if UNIX64
+cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
+%else
+cglobal filter_sobel, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
+%endif
+%if WIN64
+    SWAP xmm0, xmm2    ; scale and delta are the 3rd/4th arguments, so WIN64 passes them in xmm2/xmm3; rename so the code below can use xmm0/xmm1
+    SWAP xmm1, xmm3
+    mov  r2q, matrixmp    ; fetch matrix and the c[] pointer from their WIN64 argument locations
+    mov  r3q, ptrmp
+    DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
+%endif
+    movsxdifnidn widthq, widthd    ; width is a 32-bit int argument; widen it for 64-bit compares and arithmetic
+    VBROADCASTSS m0, xmm0    ; m0 = scale in every float lane
+    VBROADCASTSS m1, xmm1    ; m1 = delta in every float lane
+    pxor  m6, m6    ; m6 zeroed here; it is overwritten with sobel_perm before the vector loop
+    mov   c0q, [ptrq + 0*gprsize]    ; load the nine row pointers c[0]..c[8]; ptr is reused as scratch by SOBEL_MUL/SOBEL_ADD afterwards
+    mov   c1q, [ptrq + 1*gprsize]
+    mov   c2q, [ptrq + 2*gprsize]
+    mov   c3q, [ptrq + 3*gprsize]
+    mov   c4q, [ptrq + 4*gprsize]    ; c4 (centre tap) is loaded but not read by either loop below
+    mov   c5q, [ptrq + 5*gprsize]
+    mov   c6q, [ptrq + 6*gprsize]
+    mov   c7q, [ptrq + 7*gprsize]
+    mov   c8q, [ptrq + 8*gprsize]
+
+    xor   xq, xq    ; x = 0, current output column
+    cmp   widthq, mmsize/4    ; mmsize/4 pixels are produced per vector iteration (one dword lane per pixel)
+    jl .loop2    ; too narrow for a single vector iteration: scalar loop only
+
+    mov   rq, widthq
+    and   rq, mmsize/4-1    ; r = leftover pixel count after the last full vector iteration
+    sub   widthq, rq    ; width temporarily rounded down to a multiple of mmsize/4
+
+    mova  m6, [sobel_perm]    ; byte-gather indices for vpermb
+.loop1:
+    movu          xm3, [c2q + xq]    ; lane 0 = 16 pixels of c2
+    pmovzxbd      m5, [c0q + xq]    ; m5 = 16 pixels of c0 widened to dwords
+    vinserti32x4  ym3, [c6q + xq], 1    ; lane 1 = c6
+    pmovzxbd      m4, [c8q + xq]    ; m4 = 16 pixels of c8 widened to dwords
+    vinserti32x4  m2, m3, [c1q + xq], 2    ; m2 lanes: c2, c6, c1, (c7 inserted below)
+    vinserti32x4  m3, [c5q + xq], 2    ; m3 lanes: c2, c6, c5, (c3 inserted below)
+    vinserti32x4  m2, [c7q + xq], 3
+    vinserti32x4  m3, [c3q + xq], 3
+    vpermb        m2, m6, m2    ; regroup so dword i holds the bytes c2[i], c6[i], c1[i], c7[i]
+    psubd         m4, m5    ; m4 = c8 - c0, the part common to both sums
+    vpermb        m3, m6, m3    ; regroup so dword i holds the bytes c2[i], c6[i], c5[i], c3[i]
+    mova          m5, m4    ; second accumulator starts from the same c8 - c0
+    vpdpbusd      m4, m2, [sobel_mulA] {1to16}    ; m4 += -c2 + c6 - 2*c1 + 2*c7 (unsigned pixels times signed weights)
+    vpdpbusd      m5, m3, [sobel_mulB] {1to16}    ; m5 +=  c2 - c6 + 2*c5 - 2*c3
+
+    cvtdq2ps  m4, m4
+    mulps     m4, m4    ; m4 = first sum squared
+
+    cvtdq2ps    m5, m5
+    VFMADD231PS m4, m5, m5    ; m4 += second sum squared
+
+    sqrtps    m4, m4    ; gradient magnitude
+    fmaddps m4, m4, m0, m1    ; magnitude * scale + delta
+    cvttps2dq m4, m4    ; truncate to integer
+    vpmovusdb [dstq + xq], m4    ; store as bytes with unsigned saturation; NOTE(review): confirm negative results (negative delta) clamp the same way as the scalar loop's packssdw/packuswb
+
+    add xq, mmsize/4
+    cmp xq, widthq
+    jl .loop1
+
+    add widthq, rq    ; restore the full width
+    cmp xq, widthq
+    jge .end    ; no leftover pixels
+
+.loop2:
+    xor  rd, rd    ; scalar path, one pixel per iteration; rd = running sum
+    pxor m4, m4
+
+    ;Gx
+    SOBEL_MUL 0, data_n1
+    SOBEL_MUL 1, data_n2
+    SOBEL_MUL 2, data_n1
+    SOBEL_ADD 6
+    SOBEL_MUL 7, data_p2
+    SOBEL_ADD 8
+
+    cvtsi2ss xmm4, rd
+    mulss    xmm4, xmm4    ; xmm4 = first sum squared
+
+    xor rd, rd
+    ;Gy
+    SOBEL_MUL 0, data_n1
+    SOBEL_ADD 2
+    SOBEL_MUL 3, data_n2
+    SOBEL_MUL 5, data_p2
+    SOBEL_MUL 6, data_n1
+    SOBEL_ADD 8
+
+    cvtsi2ss  xmm5, rd
+    fmaddss xmm4, xmm5, xmm5, xmm4    ; xmm4 += second sum squared
+
+    sqrtps    xmm4, xmm4    ; packed form; only the low lane ends up stored below
+    fmaddss   xmm4, xmm4, xmm0, xmm1     ;sum = sum * rdiv + bias
+    cvttps2dq xmm4, xmm4     ; trunc to integer
+    packssdw  xmm4, xmm4    ; dword -> word with signed saturation
+    packuswb  xmm4, xmm4    ; word -> byte with unsigned saturation, i.e. clamp to 0..255
+    movd      rd, xmm4
+    mov       [dstq + xq], rb    ; store the low byte as the output pixel
+
+    add xq, 1
+    cmp xq, widthq
+    jl .loop2
+.end:
+    RET
+%endmacro
+
+%if ARCH_X86_64
+%if HAVE_AVX512ICL_EXTERNAL
+INIT_ZMM avx512icl
+FILTER_SOBEL
+%endif
+%endif
diff --git a/libavfilter/x86/vf_convolution_init.c b/libavfilter/x86/vf_convolution_init.c
index b78a47d02b..bff10ca1a4 100644
--- a/libavfilter/x86/vf_convolution_init.c
+++ b/libavfilter/x86/vf_convolution_init.c
@@ -29,6 +29,11 @@  void ff_filter_3x3_sse4(uint8_t *dst, int width,
                         const uint8_t *c[], int peak, int radius,
                         int dstride, int stride, int size);
 
+void ff_filter_sobel_avx512icl(uint8_t *dst, int width,
+                         float scale, float delta, const int *const matrix,
+                         const uint8_t *c[], int peak, int radius,
+                         int dstride, int stride, int size); /* presumably the FILTER_SOBEL instantiation (INIT_ZMM avx512icl) in x86/vf_convolution.asm */
+
 av_cold void ff_convolution_init_x86(ConvolutionContext *s)
 {
 #if ARCH_X86_64
@@ -44,3 +49,16 @@  av_cold void ff_convolution_init_x86(ConvolutionContext *s)
     }
 #endif
 }
+
+av_cold void ff_sobel_init_x86(ConvolutionContext *s, int depth, int nb_planes)
+{ /* Replace s->filter[] for the first nb_planes planes with the x86 sobel routine when it is usable. */
+#if ARCH_X86_64 /* the assembly is only built for x86-64; elsewhere this function does nothing */
+    int cpu_flags = av_get_cpu_flags();
+    for (int i = 0; i < nb_planes; i++) {
+        if (depth == 8) { /* only 8-bit input is overridden; other depths keep whatever s->filter[i] already holds */
+            if (EXTERNAL_AVX512ICL(cpu_flags))
+                s->filter[i] = ff_filter_sobel_avx512icl;
+        }
+    }
+#endif
+}
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 62d6616faf..a6f06c7007 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -46,6 +46,7 @@  AVFILTEROBJS-$(CONFIG_GBLUR_FILTER)      += vf_gblur.o
 AVFILTEROBJS-$(CONFIG_HFLIP_FILTER)      += vf_hflip.o
 AVFILTEROBJS-$(CONFIG_THRESHOLD_FILTER)  += vf_threshold.o
 AVFILTEROBJS-$(CONFIG_NLMEANS_FILTER)    += vf_nlmeans.o
+AVFILTEROBJS-$(CONFIG_SOBEL_FILTER)      += vf_convolution.o
 
 CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 421bd096c5..3eb4780a64 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -197,6 +197,9 @@  static const struct {
     #if CONFIG_THRESHOLD_FILTER
         { "vf_threshold", checkasm_check_vf_threshold },
     #endif
+    #if CONFIG_SOBEL_FILTER
+        { "vf_sobel", checkasm_check_vf_sobel },
+    #endif
 #endif
 #if CONFIG_SWSCALE
     { "sw_gbrp", checkasm_check_sw_gbrp },
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index ee9151410e..214918e7ea 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -86,6 +86,7 @@  void checkasm_check_vf_eq(void);
 void checkasm_check_vf_gblur(void);
 void checkasm_check_vf_hflip(void);
 void checkasm_check_vf_threshold(void);
+void checkasm_check_vf_sobel(void);
 void checkasm_check_vp8dsp(void);
 void checkasm_check_vp9dsp(void);
 void checkasm_check_videodsp(void);
diff --git a/tests/checkasm/vf_convolution.c b/tests/checkasm/vf_convolution.c
new file mode 100644
index 0000000000..007865863e
--- /dev/null
+++ b/tests/checkasm/vf_convolution.c
@@ -0,0 +1,104 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+#include "checkasm.h"
+#include "libavfilter/avfilter.h"
+#include "libavfilter/convolution.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+#define WIDTH 512
+#define HEIGHT 512
+#define SRC_STRIDE 512
+#define PIXELS (WIDTH * HEIGHT)
+
+#define randomize_buffers(buf, size)      \
+    do {                                  \
+        int j;                            \
+        uint8_t *tmp_buf = (uint8_t *)buf;\
+        for (j = 0; j< size; j++)         \
+            tmp_buf[j] = rnd() & 0xFF;    \
+    } while (0) /* fills the first `size` bytes of `buf` with the low 8 bits of successive rnd() values */
+
+static void check_sobel(const char * report_name)
+{ /* Runs the reference and the optimized sobel row filter on one random 8-bit plane and compares their output row by row. */
+    LOCAL_ALIGNED_32(uint8_t, src,     [PIXELS]);
+    LOCAL_ALIGNED_32(uint8_t, dst_ref, [PIXELS]);
+    LOCAL_ALIGNED_32(uint8_t, dst_new, [PIXELS]);
+    const int height = WIDTH;  /* NOTE(review): height is initialised from WIDTH and width from HEIGHT; harmless only while both are 512 */
+    const int width  = HEIGHT;
+    const int stride = SRC_STRIDE;
+    const int dstride = SRC_STRIDE;
+    int mode = 0; /* never modified, so every `mode == MATRIX_COLUMN` test below has a fixed outcome (MATRIX_COLUMN is defined outside this file) */
+    const uint8_t *c[49]; /* tap pointers filled in by s.setup[0] for each row */
+    const int radius = 1;
+    const int bpc = 1; /* passed to s.setup[0] as bpc; 1 matches the uint8_t buffers used here */
+    const int step = mode == MATRIX_COLUMN ? 16 : 1;
+    const int slice_start = 0;
+    const int slice_end = height;
+    int y;
+    const int sizew = mode == MATRIX_COLUMN ? height : width;
+    float scale = 2;
+    float delta = 10;
+
+    ConvolutionContext s; /* not zero-initialised; only the fields assigned below are set before ff_sobel_init() */
+
+    declare_func(void, uint8_t *dst, int width, float scale, float delta, const int *const matrix,
+                 const uint8_t *c[], int peak, int radius, int dstride, int stride, int size);
+
+    s.scale = scale;
+    s.delta = delta;
+    s.depth = 8;
+    s.nb_planes = 3;
+    s.planes = 15;
+    ff_sobel_init(&s, s.depth, s.nb_planes); /* expected to populate s.filter[] and s.setup[], which are used below */
+
+    memset(dst_ref, 0, PIXELS);
+    memset(dst_new, 0, PIXELS);
+    randomize_buffers(src, PIXELS);
+
+    if (check_func(s.filter[0], "%s", report_name)) {
+        for (y = slice_start; y < slice_end; y += step) {
+            const int xoff = mode == MATRIX_COLUMN ? (y - slice_start) * bpc : radius * bpc;
+            const int yoff = mode == MATRIX_COLUMN ? radius * dstride : 0;
+
+            s.setup[0](radius, c, src, stride, radius, width, y, height, bpc); /* point c[] at the source neighbourhood for column `radius` of row y */
+            call_ref(dst_ref + yoff + xoff, sizew - 2 * radius,
+                     scale, delta, NULL, c, 0, radius,
+                     dstride, stride, slice_end - step);
+            call_new(dst_new + yoff + xoff, sizew - 2 * radius,
+                     scale, delta, NULL, c, 0, radius,
+                     dstride, stride, slice_end - step);
+            if (memcmp(dst_ref + yoff + xoff, dst_new + yoff + xoff, slice_end - step)) /* NOTE(review): compares slice_end - step bytes, while the filters were asked for sizew - 2 * radius pixels */
+                fail();
+            bench_new(dst_new + yoff + xoff, sizew - 2 * radius,
+                      scale, delta, NULL, c, 0, radius,
+                      dstride, stride, slice_end - step);
+            if (mode != MATRIX_COLUMN)
+                dst_ref += dstride; /* NOTE(review): only dst_ref advances per row; dst_new is rewritten at the same offset every iteration */
+        }
+    }
+
+}
+
+void checkasm_check_vf_sobel(void)
+{ /* checkasm entry point for the sobel test; declared in checkasm.h and listed as "vf_sobel" in checkasm.c by this patch */
+    check_sobel("sobel"); /* "sobel" is the name handed to check_func() inside check_sobel() */
+    report("convolution:sobel");
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index aa9b288e12..a4e95541f5 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -43,6 +43,7 @@  FATE_CHECKASM = fate-checkasm-aacpsdsp                                  \
                 fate-checkasm-vf_hflip                                  \
                 fate-checkasm-vf_nlmeans                                \
                 fate-checkasm-vf_threshold                              \
+                fate-checkasm-vf_sobel                                  \
                 fate-checkasm-videodsp                                  \
                 fate-checkasm-vorbisdsp                                 \
                 fate-checkasm-vp8dsp                                    \