diff mbox series

[FFmpeg-devel,1] avcodec/x86: add cfhdenc SIMD

Message ID 20210227002723.23754-1-onemda@gmail.com
State Superseded
Headers show
Series [FFmpeg-devel,1] avcodec/x86: add cfhdenc SIMD | expand

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Paul B Mahol Feb. 27, 2021, 12:27 a.m. UTC
Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavcodec/cfhdencdsp.c          |   3 +
 libavcodec/x86/Makefile          |   2 +
 libavcodec/x86/cfhdencdsp.asm    | 431 +++++++++++++++++++++++++++++++
 libavcodec/x86/cfhdencdsp_init.c |  48 ++++
 4 files changed, 484 insertions(+)
 create mode 100644 libavcodec/x86/cfhdencdsp.asm
 create mode 100644 libavcodec/x86/cfhdencdsp_init.c

Comments

James Almer Feb. 27, 2021, 1:23 a.m. UTC | #1
On 2/26/2021 9:27 PM, Paul B Mahol wrote:
> Signed-off-by: Paul B Mahol <onemda@gmail.com>
> ---
>   libavcodec/cfhdencdsp.c          |   3 +
>   libavcodec/x86/Makefile          |   2 +
>   libavcodec/x86/cfhdencdsp.asm    | 431 +++++++++++++++++++++++++++++++
>   libavcodec/x86/cfhdencdsp_init.c |  48 ++++
>   4 files changed, 484 insertions(+)
>   create mode 100644 libavcodec/x86/cfhdencdsp.asm
>   create mode 100644 libavcodec/x86/cfhdencdsp_init.c
> 
> diff --git a/libavcodec/cfhdencdsp.c b/libavcodec/cfhdencdsp.c
> index 0becb76d1d..b979e9e09a 100644
> --- a/libavcodec/cfhdencdsp.c
> +++ b/libavcodec/cfhdencdsp.c
> @@ -73,4 +73,7 @@ av_cold void ff_cfhdencdsp_init(CFHDEncDSPContext *c)
>   {
>       c->horiz_filter = horiz_filter;
>       c->vert_filter = vert_filter;
> +
> +    if (ARCH_X86)
> +        ff_cfhdencdsp_init_x86(c);
>   }
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index 884dc0c759..6361161180 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -51,6 +51,7 @@ OBJS-$(CONFIG_ALAC_DECODER)            += x86/alacdsp_init.o
>   OBJS-$(CONFIG_APNG_DECODER)            += x86/pngdsp_init.o
>   OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
>   OBJS-$(CONFIG_CFHD_DECODER)            += x86/cfhddsp_init.o
> +OBJS-$(CONFIG_CFHD_ENCODER)            += x86/cfhdencdsp_init.o
>   OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o x86/synth_filter_init.o
>   OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
>   OBJS-$(CONFIG_EXR_DECODER)             += x86/exrdsp_init.o
> @@ -154,6 +155,7 @@ X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
>   X86ASM-OBJS-$(CONFIG_ALAC_DECODER)     += x86/alacdsp.o
>   X86ASM-OBJS-$(CONFIG_APNG_DECODER)     += x86/pngdsp.o
>   X86ASM-OBJS-$(CONFIG_CAVS_DECODER)     += x86/cavsidct.o
> +X86ASM-OBJS-$(CONFIG_CFHD_ENCODER)     += x86/cfhdencdsp.o
>   X86ASM-OBJS-$(CONFIG_CFHD_DECODER)     += x86/cfhddsp.o
>   X86ASM-OBJS-$(CONFIG_DCA_DECODER)      += x86/dcadsp.o x86/synth_filter.o
>   X86ASM-OBJS-$(CONFIG_DIRAC_DECODER)    += x86/diracdsp.o                \
> diff --git a/libavcodec/x86/cfhdencdsp.asm b/libavcodec/x86/cfhdencdsp.asm
> new file mode 100644
> index 0000000000..be51c77c46
> --- /dev/null
> +++ b/libavcodec/x86/cfhdencdsp.asm
> @@ -0,0 +1,431 @@
> +;******************************************************************************
> +;* x86-optimized functions for the CFHD encoder
> +;* Copyright (c) 2021 Paul B Mahol
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +pw_p1_n1:  dw  1, -1, 1, -1, 1, -1, 1, -1
> +pw_n1_p1:  dw  -1, 1, -1, 1, -1, 1, -1, 1
> +pw_p5_n11: dw  5, -11, 5, -11, 5, -11, 5, -11
> +pw_n5_p11: dw -5, 11, -5, 11, -5, 11, -5, 11
> +pw_p11_n5: dw 11, -5, 11, -5, 11, -5, 11, -5
> +pw_n11_p5: dw -11, 5, -11, 5, -11, 5, -11, 5
> +pd_4:  times 4 dd  4
> +pw_n4: times 8 dw -4
> +pw_n1: times 8 dw -1

cextern pw_m1 (use the existing shared constant instead of adding a local pw_n1 table)

> +cextern pw_1
> +cextern pw_4
> +
> +SECTION .text
> +
> +%if ARCH_X86_64
> +INIT_XMM sse2
> +cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, hwidth, width, y, x, temp
> +    shl  istrided, 1
> +    shl   lwidthd, 1
> +    shl   hwidthd, 1

These are ptrdiff_t, so you can use the q suffix just fine.

> +    mova       m7, [pd_4]
> +    mova       m8, [pw_1]
> +    mova       m9, [pw_n1]
> +    mova       m10,[pw_p1_n1]
> +    neg        yq

This one is int, so:

movsxdifnidn yq, yd
neg          yq

> +.looph:
> +    movsx          xq, word [inputq]
> +
> +    movsx       tempq, word [inputq + 2]
> +    add         tempq, xq
> +
> +    movd          xm0, tempd
> +    packssdw       m0, m0
> +    pextrw      tempd, xm0, 0

movd tempd, m0

There's no reason to use pextrw if you're going to read only 16 bits 
right below, then discard the value.

> +    mov   word [lowq], tempw
> +
> +    movsx          xq, word [inputq]
> +    imul           xq, 5
> +    movsx       tempq, word [inputq + 2]
> +    imul        tempq, -11
> +    add         tempq, xq
> +
> +    movsx          xq, word [inputq + 4]
> +    imul           xq, 4
> +    add         tempq, xq
> +
> +    movsx          xq, word [inputq + 6]
> +    imul           xq, 4
> +    add         tempq, xq
> +
> +    movsx          xq, word [inputq + 8]
> +    imul           xq, -1
> +    add         tempq, xq
> +
> +    movsx          xq, word [inputq + 10]
> +    imul           xq, -1
> +    add         tempq, xq
> +
> +    add         tempq, 4
> +    sar         tempq, 3
> +
> +    movd          xm0, tempd
> +    packssdw       m0, m0
> +    pextrw      tempd, xm0, 0

Same.

> +    mov  word [highq], tempw
> +
> +    mov            xq, 2
> +
> +.loopw:
> +    movu           m0, [inputq + xq * 2]
> +    movu           m1, [inputq + xq * 2 + mmsize]
> +
> +    pmaddwd        m0, m8
> +    pmaddwd        m1, m8
> +
> +    packssdw       m0, m1
> +    movu    [lowq+xq], m0
> +
> +    movu           m2, [inputq + xq * 2 - 4]
> +    movu           m3, [inputq + xq * 2 - 4 + mmsize]
> +
> +    pmaddwd        m2, m9
> +    pmaddwd        m3, m9
> +
> +    movu           m0, [inputq + xq * 2 + 4]
> +    movu           m1, [inputq + xq * 2 + 4 + mmsize]
> +
> +    pmaddwd        m0, m8
> +    pmaddwd        m1, m8
> +
> +    paddd          m0, m2
> +    paddd          m1, m3
> +
> +    paddd          m0, m7
> +    paddd          m1, m7
> +
> +    psrad          m0, 3
> +    psrad          m1, 3
> +
> +    movu           m5, [inputq + xq * 2 + 0]
> +    movu           m6, [inputq + xq * 2 + mmsize]
> +
> +    pmaddwd        m5, m10
> +    pmaddwd        m6, m10
> +
> +    paddd          m0, m5
> +    paddd          m1, m6
> +
> +    packssdw       m0, m1
> +    movu   [highq+xq], m0
> +
> +    add            xq, mmsize
> +    cmp            xq, widthq

width is an int, so use movsxdifnidn on it alongside y.

> +    jl .loopw
> +
> +    add          lowq, widthq
> +    add         highq, widthq
> +    add        inputq, widthq
> +    add        inputq, widthq

Twice? If this is correct, you could use lea.

> +
> +    movsx          xq, word [inputq - 4]
> +    movsx       tempq, word [inputq - 2]
> +    add         tempq, xq
> +
> +    movd          xm0, tempd
> +    packssdw       m0, m0
> +    pextrw      tempd, xm0, 0

also movd.

> +    mov word [lowq-2], tempw
> +
> +    movsx       tempq, word [inputq - 4]
> +    imul        tempq, 11
> +    movsx          xq, word [inputq - 2]
> +    imul           xq, -5
> +    add         tempq, xq
> +
> +    movsx          xq, word [inputq - 6]
> +    imul           xq, -4
> +    add         tempq, xq
> +
> +    movsx          xq, word [inputq - 8]
> +    imul           xq, -4
> +    add         tempq, xq
> +
> +    movsx          xq, word [inputq - 10]
> +    add         tempq, xq
> +
> +    movsx          xq, word [inputq - 12]
> +    add         tempq, xq
> +
> +    add         tempq, 4
> +    sar         tempq, 3
> +
> +    movd          xm0, tempd
> +    packssdw       m0, m0
> +    pextrw      tempd, xm0, 0

Same.

> +    mov word [highq-2], tempw
> +
> +    sub        inputq, widthq
> +    sub        inputq, widthq
> +    sub         highq, widthq
> +    sub          lowq, widthq
> +
> +    add          lowq, lwidthq
> +    add         highq, hwidthq
> +    add        inputq, istrideq
> +    add            yq, 1
> +    jl .looph
> +
> +    RET
> +%endif
> +
> +%if ARCH_X86_64
> +INIT_XMM sse2
> +cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidth, width, height, x, y, pos
> +    shl  istrided, 1
> +    shl    widthd, 1
> +
> +    sub   heightq, 2

Use heightd here: height is an int, so only its low 32 bits are defined.

> +
> +    xor        xq, xq
> +
> +    mova       m7, [pd_4]
> +    mova       m8, [pw_1]
> +    mova       m9, [pw_n1]
> +    mova       m10,[pw_p1_n1]
> +    mova       m11,[pw_n1_p1]
> +    mova       m12,[pw_4]
> +    mova       m13,[pw_n4]
> +.loopw:
> +    mov        yq, 2
> +
> +    mov      posq, xq
> +    movu       m0, [inputq + posq]
> +    add      posq, istrideq
> +    movu       m1, [inputq + posq]
> +
> +    paddsw     m0, m1
> +
> +    movu    [lowq + xq], m0
> +
> +    mov      posq, xq
> +
> +    movu       m0, [inputq + posq]
> +    add      posq, istrideq
> +    movu       m1, [inputq + posq]
> +    add      posq, istrideq
> +    movu       m2, [inputq + posq]
> +    add      posq, istrideq
> +    movu       m3, [inputq + posq]
> +    add      posq, istrideq
> +    movu       m4, [inputq + posq]
> +    add      posq, istrideq
> +    movu       m5, [inputq + posq]
> +
> +    mova       m6, m0
> +    punpcklwd  m0, m1
> +    punpckhwd  m1, m6
> +
> +    mova       m6, m2
> +    punpcklwd  m2, m3
> +    punpckhwd  m3, m6
> +
> +    mova       m6, m4
> +    punpcklwd  m4, m5
> +    punpckhwd  m5, m6
> +
> +    pmaddwd    m0, [pw_p5_n11]
> +    pmaddwd    m1, [pw_n11_p5]
> +    pmaddwd    m2, m12
> +    pmaddwd    m3, m12
> +    pmaddwd    m4, m9
> +    pmaddwd    m5, m9
> +
> +    paddd      m0, m2
> +    paddd      m1, m3
> +    paddd      m0, m4
> +    paddd      m1, m5
> +
> +    paddd      m0, m7
> +    paddd      m1, m7
> +
> +    psrad      m0, 3
> +    psrad      m1, 3
> +    packssdw   m0, m1
> +
> +    movu   [highq + xq], m0
> +
> +.looph:
> +
> +    mov      posq, istrideq
> +    imul     posq, yq
> +    add      posq, xq
> +
> +    movu       m0, [inputq + posq]
> +
> +    add      posq, istrideq
> +    movu       m1, [inputq + posq]
> +
> +    paddsw     m0, m1
> +
> +    mov      posq, lwidthq
> +    imul     posq, yq
> +    add      posq, xq
> +
> +    movu    [lowq + posq], m0
> +
> +    add        yq, -2
> +
> +    mov      posq, istrideq
> +    imul     posq, yq
> +    add      posq, xq
> +
> +    movu       m0, [inputq + posq]
> +    add      posq, istrideq
> +    movu       m1, [inputq + posq]
> +    add      posq, istrideq
> +    movu       m2, [inputq + posq]
> +    add      posq, istrideq
> +    movu       m3, [inputq + posq]
> +    add      posq, istrideq
> +    movu       m4, [inputq + posq]
> +    add      posq, istrideq
> +    movu       m5, [inputq + posq]
> +
> +    add        yq, 2
> +
> +    mova       m6, m0
> +    punpcklwd  m0, m1
> +    punpckhwd  m1, m6
> +
> +    mova       m6, m2
> +    punpcklwd  m2, m3
> +    punpckhwd  m3, m6
> +
> +    mova       m6, m4
> +    punpcklwd  m4, m5
> +    punpckhwd  m5, m6
> +
> +    pmaddwd    m0, m9
> +    pmaddwd    m1, m9
> +    pmaddwd    m2, m10
> +    pmaddwd    m3, m11
> +    pmaddwd    m4, m8
> +    pmaddwd    m5, m8
> +
> +    paddd      m0, m4
> +    paddd      m1, m5
> +
> +    paddd      m0, m7
> +    paddd      m1, m7
> +
> +    psrad      m0, 3
> +    psrad      m1, 3
> +    paddd      m0, m2
> +    paddd      m1, m3
> +    packssdw   m0, m1
> +
> +    mov      posq, hwidthq
> +    imul     posq, yq
> +    add      posq, xq
> +
> +    movu   [highq + posq], m0
> +
> +    add        yq, 2
> +    cmp        yq, heightq
> +    jl .looph
> +
> +    mov      posq, istrideq
> +    imul     posq, yq
> +    add      posq, xq
> +
> +    movu       m0, [inputq + posq]
> +    add      posq, istrideq
> +    movu       m1, [inputq + posq]
> +
> +    paddsw     m0, m1
> +
> +    mov      posq, lwidthq
> +    imul     posq, yq
> +    add      posq, xq
> +
> +    movu    [lowq + posq], m0
> +
> +    sub        yq, 4
> +
> +    mov      posq, istrideq
> +    imul     posq, yq
> +    add      posq, xq
> +
> +    movu       m0, [inputq + posq]
> +    add      posq, istrideq
> +    movu       m1, [inputq + posq]
> +    add      posq, istrideq
> +    movu       m2, [inputq + posq]
> +    add      posq, istrideq
> +    movu       m3, [inputq + posq]
> +    add      posq, istrideq
> +    movu       m4, [inputq + posq]
> +    add      posq, istrideq
> +    movu       m5, [inputq + posq]
> +
> +    add        yq, 4
> +
> +    mova       m6, m0
> +    punpcklwd  m0, m1
> +    punpckhwd  m1, m6
> +
> +    mova       m6, m2
> +    punpcklwd  m2, m3
> +    punpckhwd  m3, m6
> +
> +    mova       m6, m4
> +    punpcklwd  m4, m5
> +    punpckhwd  m5, m6
> +
> +    pmaddwd    m0, m8
> +    pmaddwd    m1, m8
> +    pmaddwd    m2, m13
> +    pmaddwd    m3, m13
> +    pmaddwd    m4, [pw_p11_n5]
> +    pmaddwd    m5, [pw_n5_p11]
> +
> +    paddd      m4, m2
> +    paddd      m5, m3
> +
> +    paddd      m4, m0
> +    paddd      m5, m1
> +
> +    paddd      m4, m7
> +    paddd      m5, m7
> +
> +    psrad      m4, 3
> +    psrad      m5, 3
> +    packssdw   m4, m5
> +
> +    mov      posq, hwidthq
> +    imul     posq, yq
> +    add      posq, xq
> +
> +    movu   [highq + posq], m4
> +
> +    add        xq, mmsize
> +    cmp        xq, widthq
> +    jl .loopw
> +    RET
> +%endif
> diff --git a/libavcodec/x86/cfhdencdsp_init.c b/libavcodec/x86/cfhdencdsp_init.c
> new file mode 100644
> index 0000000000..28f1dd504d
> --- /dev/null
> +++ b/libavcodec/x86/cfhdencdsp_init.c
> @@ -0,0 +1,48 @@
> +/*
> + * Copyright (c) 2021 Paul B Mahol
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <stdint.h>
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavcodec/avcodec.h"
> +#include "libavcodec/cfhdencdsp.h"
> +
> +void ff_cfhdenc_horiz_filter_sse2(int16_t *input, int16_t *low, int16_t *high,
> +                                  ptrdiff_t in_stride, ptrdiff_t low_stride,
> +                                  ptrdiff_t high_stride,
> +                                  int width, int height);
> +void ff_cfhdenc_vert_filter_sse2(int16_t *input, int16_t *low, int16_t *high,
> +                                 ptrdiff_t in_stride, ptrdiff_t low_stride,
> +                                 ptrdiff_t high_stride,
> +                                 int width, int height);
> +
> +av_cold void ff_cfhdencdsp_init_x86(CFHDEncDSPContext *c)
> +{
> +    int cpu_flags = av_get_cpu_flags();
> +
> +#if ARCH_X86_64
> +    if (EXTERNAL_SSE2(cpu_flags)) {
> +        c->horiz_filter = ff_cfhdenc_horiz_filter_sse2;
> +        c->vert_filter = ff_cfhdenc_vert_filter_sse2;
> +    }
> +#endif
> +}
>
diff mbox series

Patch

diff --git a/libavcodec/cfhdencdsp.c b/libavcodec/cfhdencdsp.c
index 0becb76d1d..b979e9e09a 100644
--- a/libavcodec/cfhdencdsp.c
+++ b/libavcodec/cfhdencdsp.c
@@ -73,4 +73,7 @@  av_cold void ff_cfhdencdsp_init(CFHDEncDSPContext *c)
 {
     c->horiz_filter = horiz_filter;
     c->vert_filter = vert_filter;
+
+    if (ARCH_X86)
+        ff_cfhdencdsp_init_x86(c);
 }
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 884dc0c759..6361161180 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -51,6 +51,7 @@  OBJS-$(CONFIG_ALAC_DECODER)            += x86/alacdsp_init.o
 OBJS-$(CONFIG_APNG_DECODER)            += x86/pngdsp_init.o
 OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
 OBJS-$(CONFIG_CFHD_DECODER)            += x86/cfhddsp_init.o
+OBJS-$(CONFIG_CFHD_ENCODER)            += x86/cfhdencdsp_init.o
 OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o x86/synth_filter_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
 OBJS-$(CONFIG_EXR_DECODER)             += x86/exrdsp_init.o
@@ -154,6 +155,7 @@  X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
 X86ASM-OBJS-$(CONFIG_ALAC_DECODER)     += x86/alacdsp.o
 X86ASM-OBJS-$(CONFIG_APNG_DECODER)     += x86/pngdsp.o
 X86ASM-OBJS-$(CONFIG_CAVS_DECODER)     += x86/cavsidct.o
+X86ASM-OBJS-$(CONFIG_CFHD_ENCODER)     += x86/cfhdencdsp.o
 X86ASM-OBJS-$(CONFIG_CFHD_DECODER)     += x86/cfhddsp.o
 X86ASM-OBJS-$(CONFIG_DCA_DECODER)      += x86/dcadsp.o x86/synth_filter.o
 X86ASM-OBJS-$(CONFIG_DIRAC_DECODER)    += x86/diracdsp.o                \
diff --git a/libavcodec/x86/cfhdencdsp.asm b/libavcodec/x86/cfhdencdsp.asm
new file mode 100644
index 0000000000..be51c77c46
--- /dev/null
+++ b/libavcodec/x86/cfhdencdsp.asm
@@ -0,0 +1,431 @@ 
+;******************************************************************************
+;* x86-optimized functions for the CFHD encoder
+;* Copyright (c) 2021 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_p1_n1:  dw  1, -1, 1, -1, 1, -1, 1, -1 ; pmaddwd: a - b for each word pair (a, b)
+pw_n1_p1:  dw  -1, 1, -1, 1, -1, 1, -1, 1 ; pmaddwd: b - a for each word pair (a, b)
+pw_p5_n11: dw  5, -11, 5, -11, 5, -11, 5, -11 ; first-row taps for pairs (r0, r1)
+pw_n5_p11: dw -5, 11, -5, 11, -5, 11, -5, 11 ; last-row taps for swapped pairs
+pw_p11_n5: dw 11, -5, 11, -5, 11, -5, 11, -5 ; last-row taps for in-order pairs
+pw_n11_p5: dw -11, 5, -11, 5, -11, 5, -11, 5 ; first-row taps for swapped pairs (r1, r0)
+pd_4:  times 4 dd  4 ; bias added before the arithmetic shift right by 3
+pw_n4: times 8 dw -4 ; pmaddwd: -4 * (a + b)
+pw_n1: times 8 dw -1 ; pmaddwd: -(a + b)
+cextern pw_1 ; defined outside this file; pmaddwd: a + b
+cextern pw_4 ; defined outside this file; pmaddwd: 4 * (a + b)
+
+SECTION .text
+
+%if ARCH_X86_64
+;------------------------------------------------------------------------------
+; void ff_cfhdenc_horiz_filter_sse2(int16_t *input, int16_t *low, int16_t *high,
+;                                   ptrdiff_t in_stride, ptrdiff_t low_stride,
+;                                   ptrdiff_t high_stride, int width, int height)
+; Per row: low[i] = in[2i] + in[2i+1]; high[i] is a 6-tap difference filter.
+; The first and last output of each row use scalar edge filters, the rest SIMD.
+;------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, hwidth, width, y, x, temp
+    shl  istrideq, 1                ; strides are ptrdiff_t in int16_t units:
+    shl   lwidthq, 1                ; scale the full 64-bit value to bytes
+    shl   hwidthq, 1
+    movsxdifnidn widthq, widthd     ; int arguments: the upper 32 bits of the
+    movsxdifnidn     yq, yd         ; 64-bit registers are not defined
+    mova       m7, [pd_4]
+    mova       m8, [pw_1]
+    mova       m9, [pw_n1]
+    mova       m10,[pw_p1_n1]
+    neg        yq                   ; row counter runs from -height up to 0
+.looph:
+    ; low[0] = sat16(in[0] + in[1])
+    movsx          xq, word [inputq]
+    movsx       tempq, word [inputq + 2]
+    add         tempq, xq
+    movd          xm0, tempd
+    packssdw       m0, m0           ; saturate the 32-bit sum to int16
+    movd        tempd, m0           ; only the low word is stored below
+    mov   word [lowq], tempw
+
+    ; high[0] = sat16((5*in[0] - 11*in[1] + 4*in[2] + 4*in[3] - in[4] - in[5] + 4) >> 3)
+    movsx          xq, word [inputq]
+    imul           xq, 5
+    movsx       tempq, word [inputq + 2]
+    imul        tempq, -11
+    add         tempq, xq
+
+    movsx          xq, word [inputq + 4]
+    imul           xq, 4
+    add         tempq, xq
+
+    movsx          xq, word [inputq + 6]
+    imul           xq, 4
+    add         tempq, xq
+
+    movsx          xq, word [inputq + 8]
+    sub         tempq, xq
+
+    movsx          xq, word [inputq + 10]
+    sub         tempq, xq
+
+    add         tempq, 4
+    sar         tempq, 3
+
+    movd          xm0, tempd
+    packssdw       m0, m0
+    movd        tempd, m0
+    mov  word [highq], tempw
+
+    mov            xq, 2            ; byte offset of output sample 1
+.loopw:
+    ; low: pmaddwd by pw_1 adds each pair of adjacent input words
+    movu           m0, [inputq + xq * 2]
+    movu           m1, [inputq + xq * 2 + mmsize]
+    pmaddwd        m0, m8
+    pmaddwd        m1, m8
+    packssdw       m0, m1
+    movu    [lowq+xq], m0
+
+    ; high: ((next pair sum - previous pair sum + 4) >> 3) + in[2i] - in[2i+1]
+    movu           m2, [inputq + xq * 2 - 4]
+    movu           m3, [inputq + xq * 2 - 4 + mmsize]
+    pmaddwd        m2, m9
+    pmaddwd        m3, m9
+
+    movu           m0, [inputq + xq * 2 + 4]
+    movu           m1, [inputq + xq * 2 + 4 + mmsize]
+    pmaddwd        m0, m8
+    pmaddwd        m1, m8
+
+    paddd          m0, m2
+    paddd          m1, m3
+    paddd          m0, m7
+    paddd          m1, m7
+    psrad          m0, 3
+    psrad          m1, 3
+
+    movu           m5, [inputq + xq * 2 + 0]
+    movu           m6, [inputq + xq * 2 + mmsize]
+    pmaddwd        m5, m10
+    pmaddwd        m6, m10
+    paddd          m0, m5
+    paddd          m1, m6
+    packssdw       m0, m1
+    movu   [highq+xq], m0
+
+    add            xq, mmsize
+    cmp            xq, widthq
+    jl .loopw
+
+    ; step to the end of the row: width bytes of output, 2 * width bytes of input
+    add          lowq, widthq
+    add         highq, widthq
+    lea        inputq, [inputq + widthq * 2]
+
+    ; last low = sat16(in[w-2] + in[w-1]), with w = width
+    movsx          xq, word [inputq - 4]
+    movsx       tempq, word [inputq - 2]
+    add         tempq, xq
+    movd          xm0, tempd
+    packssdw       m0, m0
+    movd        tempd, m0
+    mov word [lowq-2], tempw
+
+    ; last high = sat16((11*in[w-2] - 5*in[w-1] - 4*in[w-3] - 4*in[w-4]
+    ;                    + in[w-5] + in[w-6] + 4) >> 3)
+    movsx       tempq, word [inputq - 4]
+    imul        tempq, 11
+    movsx          xq, word [inputq - 2]
+    imul           xq, -5
+    add         tempq, xq
+
+    movsx          xq, word [inputq - 6]
+    imul           xq, -4
+    add         tempq, xq
+
+    movsx          xq, word [inputq - 8]
+    imul           xq, -4
+    add         tempq, xq
+
+    movsx          xq, word [inputq - 10]
+    add         tempq, xq
+
+    movsx          xq, word [inputq - 12]
+    add         tempq, xq
+
+    add         tempq, 4
+    sar         tempq, 3
+
+    movd          xm0, tempd
+    packssdw       m0, m0
+    movd        tempd, m0
+    mov word [highq-2], tempw
+
+    ; rewind to the row start, then advance all three pointers by one row
+    sub        inputq, widthq
+    sub        inputq, widthq
+    sub         highq, widthq
+    sub          lowq, widthq
+    add          lowq, lwidthq
+    add         highq, hwidthq
+    add        inputq, istrideq
+    add            yq, 1
+    jl .looph
+    RET
+%endif
+
+%if ARCH_X86_64
+;------------------------------------------------------------------------------
+; void ff_cfhdenc_vert_filter_sse2(int16_t *input, int16_t *low, int16_t *high,
+;                                  ptrdiff_t in_stride, ptrdiff_t low_stride,
+;                                  ptrdiff_t high_stride, int width, int height)
+; Column-wise pass: each .loopw iteration handles mmsize bytes of every row.
+; Below, rN is input row N; row pair 0 and the final pair use edge filters.
+;------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidth, width, height, x, y, pos
+    shl  istrideq, 1                  ; ptrdiff_t stride: shift all 64 bits
+    shl    widthd, 1                  ; 32-bit write also clears the upper half
+    movsxdifnidn heightq, heightd     ; int argument: upper 32 bits not defined
+    sub   heightq, 2                  ; .looph stops before the final row pair
+
+    xor        xq, xq
+
+    mova       m7, [pd_4]
+    mova       m8, [pw_1]
+    mova       m9, [pw_n1]
+    mova       m10,[pw_p1_n1]
+    mova       m11,[pw_n1_p1]
+    mova       m12,[pw_4]
+    mova       m13,[pw_n4]
+.loopw:
+    mov        yq, 2
+
+    ; low row 0 = sat16(r0 + r1)
+    mov      posq, xq
+    movu       m0, [inputq + posq]
+    add      posq, istrideq
+    movu       m1, [inputq + posq]
+    paddsw     m0, m1
+    movu    [lowq + xq], m0
+
+    ; high row 0 = sat16((5*r0 - 11*r1 + 4*r2 + 4*r3 - r4 - r5 + 4) >> 3)
+    mov      posq, xq
+    movu       m0, [inputq + posq]
+    add      posq, istrideq
+    movu       m1, [inputq + posq]
+    add      posq, istrideq
+    movu       m2, [inputq + posq]
+    add      posq, istrideq
+    movu       m3, [inputq + posq]
+    add      posq, istrideq
+    movu       m4, [inputq + posq]
+    add      posq, istrideq
+    movu       m5, [inputq + posq]
+
+    ; interleave row pairs so that pmaddwd combines two rows per 32-bit lane
+    mova       m6, m0
+    punpcklwd  m0, m1
+    punpckhwd  m1, m6                 ; high half comes out in swapped order
+
+    mova       m6, m2
+    punpcklwd  m2, m3
+    punpckhwd  m3, m6
+
+    mova       m6, m4
+    punpcklwd  m4, m5
+    punpckhwd  m5, m6
+
+    pmaddwd    m0, [pw_p5_n11]
+    pmaddwd    m1, [pw_n11_p5]
+    pmaddwd    m2, m12
+    pmaddwd    m3, m12
+    pmaddwd    m4, m9
+    pmaddwd    m5, m9
+
+    paddd      m0, m2
+    paddd      m1, m3
+    paddd      m0, m4
+    paddd      m1, m5
+
+    paddd      m0, m7
+    paddd      m1, m7
+
+    psrad      m0, 3
+    psrad      m1, 3
+    packssdw   m0, m1
+
+    movu   [highq + xq], m0
+
+.looph:
+    ; low = sat16(r[y] + r[y+1])
+    mov      posq, istrideq
+    imul     posq, yq
+    add      posq, xq
+    movu       m0, [inputq + posq]
+    add      posq, istrideq
+    movu       m1, [inputq + posq]
+    paddsw     m0, m1
+
+    mov      posq, lwidthq
+    imul     posq, yq
+    add      posq, xq
+    movu    [lowq + posq], m0
+
+    ; high = sat16(((r[y+2] + r[y+3] - r[y-2] - r[y-1] + 4) >> 3) + r[y] - r[y+1])
+    add        yq, -2
+
+    mov      posq, istrideq
+    imul     posq, yq
+    add      posq, xq
+
+    movu       m0, [inputq + posq]
+    add      posq, istrideq
+    movu       m1, [inputq + posq]
+    add      posq, istrideq
+    movu       m2, [inputq + posq]
+    add      posq, istrideq
+    movu       m3, [inputq + posq]
+    add      posq, istrideq
+    movu       m4, [inputq + posq]
+    add      posq, istrideq
+    movu       m5, [inputq + posq]
+
+    add        yq, 2
+
+    mova       m6, m0
+    punpcklwd  m0, m1
+    punpckhwd  m1, m6
+
+    mova       m6, m2
+    punpcklwd  m2, m3
+    punpckhwd  m3, m6
+
+    mova       m6, m4
+    punpcklwd  m4, m5
+    punpckhwd  m5, m6
+
+    pmaddwd    m0, m9
+    pmaddwd    m1, m9
+    pmaddwd    m2, m10
+    pmaddwd    m3, m11
+    pmaddwd    m4, m8
+    pmaddwd    m5, m8
+
+    paddd      m0, m4
+    paddd      m1, m5
+
+    paddd      m0, m7
+    paddd      m1, m7
+
+    psrad      m0, 3
+    psrad      m1, 3
+    paddd      m0, m2
+    paddd      m1, m3
+    packssdw   m0, m1
+
+    mov      posq, hwidthq
+    imul     posq, yq
+    add      posq, xq
+    movu   [highq + posq], m0
+
+    add        yq, 2
+    cmp        yq, heightq
+    jl .looph
+
+    ; final low = sat16(r[y] + r[y+1])
+    mov      posq, istrideq
+    imul     posq, yq
+    add      posq, xq
+    movu       m0, [inputq + posq]
+    add      posq, istrideq
+    movu       m1, [inputq + posq]
+    paddsw     m0, m1
+
+    mov      posq, lwidthq
+    imul     posq, yq
+    add      posq, xq
+    movu    [lowq + posq], m0
+
+    ; final high = sat16((r[y-4] + r[y-3] - 4*r[y-2] - 4*r[y-1] + 11*r[y] - 5*r[y+1] + 4) >> 3)
+    sub        yq, 4
+
+    mov      posq, istrideq
+    imul     posq, yq
+    add      posq, xq
+    movu       m0, [inputq + posq]
+    add      posq, istrideq
+    movu       m1, [inputq + posq]
+    add      posq, istrideq
+    movu       m2, [inputq + posq]
+    add      posq, istrideq
+    movu       m3, [inputq + posq]
+    add      posq, istrideq
+    movu       m4, [inputq + posq]
+    add      posq, istrideq
+    movu       m5, [inputq + posq]
+
+    add        yq, 4
+
+    mova       m6, m0
+    punpcklwd  m0, m1
+    punpckhwd  m1, m6
+
+    mova       m6, m2
+    punpcklwd  m2, m3
+    punpckhwd  m3, m6
+
+    mova       m6, m4
+    punpcklwd  m4, m5
+    punpckhwd  m5, m6
+
+    pmaddwd    m0, m8
+    pmaddwd    m1, m8
+    pmaddwd    m2, m13
+    pmaddwd    m3, m13
+    pmaddwd    m4, [pw_p11_n5]
+    pmaddwd    m5, [pw_n5_p11]
+
+    paddd      m4, m2
+    paddd      m5, m3
+
+    paddd      m4, m0
+    paddd      m5, m1
+
+    paddd      m4, m7
+    paddd      m5, m7
+
+    psrad      m4, 3
+    psrad      m5, 3
+    packssdw   m4, m5
+
+    mov      posq, hwidthq
+    imul     posq, yq
+    add      posq, xq
+    movu   [highq + posq], m4
+
+    add        xq, mmsize
+    cmp        xq, widthq
+    jl .loopw
+    RET
+%endif
diff --git a/libavcodec/x86/cfhdencdsp_init.c b/libavcodec/x86/cfhdencdsp_init.c
new file mode 100644
index 0000000000..28f1dd504d
--- /dev/null
+++ b/libavcodec/x86/cfhdencdsp_init.c
@@ -0,0 +1,48 @@ 
+/*
+ * Copyright (c) 2021 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/cfhdencdsp.h"
+
+void ff_cfhdenc_horiz_filter_sse2(int16_t *input, int16_t *low, int16_t *high,
+                                  ptrdiff_t in_stride, ptrdiff_t low_stride,
+                                  ptrdiff_t high_stride,
+                                  int width, int height);
+void ff_cfhdenc_vert_filter_sse2(int16_t *input, int16_t *low, int16_t *high,
+                                 ptrdiff_t in_stride, ptrdiff_t low_stride,
+                                 ptrdiff_t high_stride,
+                                 int width, int height);
+
+av_cold void ff_cfhdencdsp_init_x86(CFHDEncDSPContext *c)
+{
+#if ARCH_X86_64 /* both SSE2 filters are only assembled for x86-64 */
+    int cpu_flags = av_get_cpu_flags();
+    /* c already holds the C filters; override them when SSE2 asm is usable. */
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->horiz_filter = ff_cfhdenc_horiz_filter_sse2;
+        c->vert_filter = ff_cfhdenc_vert_filter_sse2;
+    }
+#endif
+}