diff mbox series

[FFmpeg-devel] avcodec/x86: add cfhdenc SIMD

Message ID 20210227140544.8212-1-onemda@gmail.com
State Accepted
Headers show
Series [FFmpeg-devel] avcodec/x86: add cfhdenc SIMD
Related show

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Paul B Mahol Feb. 27, 2021, 2:05 p.m. UTC
Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavcodec/cfhdencdsp.c          |   3 +
 libavcodec/x86/Makefile          |   2 +
 libavcodec/x86/cfhdencdsp.asm    | 435 +++++++++++++++++++++++++++++++
 libavcodec/x86/cfhdencdsp_init.c |  48 ++++
 4 files changed, 488 insertions(+)
 create mode 100644 libavcodec/x86/cfhdencdsp.asm
 create mode 100644 libavcodec/x86/cfhdencdsp_init.c

Comments

James Almer Feb. 27, 2021, 2:32 p.m. UTC | #1
On 2/27/2021 11:05 AM, Paul B Mahol wrote:
> Signed-off-by: Paul B Mahol <onemda@gmail.com>
> ---
>   libavcodec/cfhdencdsp.c          |   3 +
>   libavcodec/x86/Makefile          |   2 +
>   libavcodec/x86/cfhdencdsp.asm    | 435 +++++++++++++++++++++++++++++++
>   libavcodec/x86/cfhdencdsp_init.c |  48 ++++
>   4 files changed, 488 insertions(+)
>   create mode 100644 libavcodec/x86/cfhdencdsp.asm
>   create mode 100644 libavcodec/x86/cfhdencdsp_init.c
> 
> diff --git a/libavcodec/cfhdencdsp.c b/libavcodec/cfhdencdsp.c
> index 0becb76d1d..b979e9e09a 100644
> --- a/libavcodec/cfhdencdsp.c
> +++ b/libavcodec/cfhdencdsp.c
> @@ -73,4 +73,7 @@ av_cold void ff_cfhdencdsp_init(CFHDEncDSPContext *c)
>   {
>       c->horiz_filter = horiz_filter;
>       c->vert_filter = vert_filter;
> +
> +    if (ARCH_X86)
> +        ff_cfhdencdsp_init_x86(c);
>   }
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index 884dc0c759..6361161180 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -51,6 +51,7 @@ OBJS-$(CONFIG_ALAC_DECODER)            += x86/alacdsp_init.o
>   OBJS-$(CONFIG_APNG_DECODER)            += x86/pngdsp_init.o
>   OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
>   OBJS-$(CONFIG_CFHD_DECODER)            += x86/cfhddsp_init.o
> +OBJS-$(CONFIG_CFHD_ENCODER)            += x86/cfhdencdsp_init.o
>   OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o x86/synth_filter_init.o
>   OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
>   OBJS-$(CONFIG_EXR_DECODER)             += x86/exrdsp_init.o
> @@ -154,6 +155,7 @@ X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
>   X86ASM-OBJS-$(CONFIG_ALAC_DECODER)     += x86/alacdsp.o
>   X86ASM-OBJS-$(CONFIG_APNG_DECODER)     += x86/pngdsp.o
>   X86ASM-OBJS-$(CONFIG_CAVS_DECODER)     += x86/cavsidct.o
> +X86ASM-OBJS-$(CONFIG_CFHD_ENCODER)     += x86/cfhdencdsp.o
>   X86ASM-OBJS-$(CONFIG_CFHD_DECODER)     += x86/cfhddsp.o
>   X86ASM-OBJS-$(CONFIG_DCA_DECODER)      += x86/dcadsp.o x86/synth_filter.o
>   X86ASM-OBJS-$(CONFIG_DIRAC_DECODER)    += x86/diracdsp.o                \
> diff --git a/libavcodec/x86/cfhdencdsp.asm b/libavcodec/x86/cfhdencdsp.asm
> new file mode 100644
> index 0000000000..b0b094aa71
> --- /dev/null
> +++ b/libavcodec/x86/cfhdencdsp.asm
> @@ -0,0 +1,435 @@
> +;******************************************************************************
> +;* x86-optimized functions for the CFHD encoder
> +;* Copyright (c) 2021 Paul B Mahol
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +pw_p1_n1:  dw  1, -1, 1, -1, 1, -1, 1, -1
> +pw_n1_p1:  dw  -1, 1, -1, 1, -1, 1, -1, 1
> +pw_p5_n11: dw  5, -11, 5, -11, 5, -11, 5, -11
> +pw_n5_p11: dw -5, 11, -5, 11, -5, 11, -5, 11
> +pw_p11_n5: dw 11, -5, 11, -5, 11, -5, 11, -5
> +pw_n11_p5: dw -11, 5, -11, 5, -11, 5, -11, 5
> +pd_4:  times 4 dd  4
> +pw_n4: times 8 dw -4
> +cextern pw_m1
> +cextern pw_1
> +cextern pw_4
> +
> +SECTION .text

[...]

> +
> +%if ARCH_X86_64
> +INIT_XMM sse2
> +cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidth, width, height, x, y, pos
> +    movsxdifnidn  widthq, widthd
> +    movsxdifnidn heightq, heightd

Why did you add this? The shl and sub below using a d suffix like in the 
previous version is enough to clear the upper bits.

> +
> +    shl  istrideq, 1
> +
> +    shl    widthq, 1
> +    sub   heightq, 2

Should be ok if tested and bitexact.
Paul B Mahol Feb. 27, 2021, 2:38 p.m. UTC | #2
On Sat, Feb 27, 2021 at 3:33 PM James Almer <jamrial@gmail.com> wrote:

> On 2/27/2021 11:05 AM, Paul B Mahol wrote:
> > Signed-off-by: Paul B Mahol <onemda@gmail.com>
> > ---
> >   libavcodec/cfhdencdsp.c          |   3 +
> >   libavcodec/x86/Makefile          |   2 +
> >   libavcodec/x86/cfhdencdsp.asm    | 435 +++++++++++++++++++++++++++++++
> >   libavcodec/x86/cfhdencdsp_init.c |  48 ++++
> >   4 files changed, 488 insertions(+)
> >   create mode 100644 libavcodec/x86/cfhdencdsp.asm
> >   create mode 100644 libavcodec/x86/cfhdencdsp_init.c
> >
> > diff --git a/libavcodec/cfhdencdsp.c b/libavcodec/cfhdencdsp.c
> > index 0becb76d1d..b979e9e09a 100644
> > --- a/libavcodec/cfhdencdsp.c
> > +++ b/libavcodec/cfhdencdsp.c
> > @@ -73,4 +73,7 @@ av_cold void ff_cfhdencdsp_init(CFHDEncDSPContext *c)
> >   {
> >       c->horiz_filter = horiz_filter;
> >       c->vert_filter = vert_filter;
> > +
> > +    if (ARCH_X86)
> > +        ff_cfhdencdsp_init_x86(c);
> >   }
> > diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> > index 884dc0c759..6361161180 100644
> > --- a/libavcodec/x86/Makefile
> > +++ b/libavcodec/x86/Makefile
> > @@ -51,6 +51,7 @@ OBJS-$(CONFIG_ALAC_DECODER)            +=
> x86/alacdsp_init.o
> >   OBJS-$(CONFIG_APNG_DECODER)            += x86/pngdsp_init.o
> >   OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
> >   OBJS-$(CONFIG_CFHD_DECODER)            += x86/cfhddsp_init.o
> > +OBJS-$(CONFIG_CFHD_ENCODER)            += x86/cfhdencdsp_init.o
> >   OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o
> x86/synth_filter_init.o
> >   OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
> >   OBJS-$(CONFIG_EXR_DECODER)             += x86/exrdsp_init.o
> > @@ -154,6 +155,7 @@ X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) +=
> x86/g722dsp.o
> >   X86ASM-OBJS-$(CONFIG_ALAC_DECODER)     += x86/alacdsp.o
> >   X86ASM-OBJS-$(CONFIG_APNG_DECODER)     += x86/pngdsp.o
> >   X86ASM-OBJS-$(CONFIG_CAVS_DECODER)     += x86/cavsidct.o
> > +X86ASM-OBJS-$(CONFIG_CFHD_ENCODER)     += x86/cfhdencdsp.o
> >   X86ASM-OBJS-$(CONFIG_CFHD_DECODER)     += x86/cfhddsp.o
> >   X86ASM-OBJS-$(CONFIG_DCA_DECODER)      += x86/dcadsp.o
> x86/synth_filter.o
> >   X86ASM-OBJS-$(CONFIG_DIRAC_DECODER)    += x86/diracdsp.o
>   \
> > diff --git a/libavcodec/x86/cfhdencdsp.asm
> b/libavcodec/x86/cfhdencdsp.asm
> > new file mode 100644
> > index 0000000000..b0b094aa71
> > --- /dev/null
> > +++ b/libavcodec/x86/cfhdencdsp.asm
> > @@ -0,0 +1,435 @@
> >
> +;******************************************************************************
> > +;* x86-optimized functions for the CFHD encoder
> > +;* Copyright (c) 2021 Paul B Mahol
> > +;*
> > +;* This file is part of FFmpeg.
> > +;*
> > +;* FFmpeg is free software; you can redistribute it and/or
> > +;* modify it under the terms of the GNU Lesser General Public
> > +;* License as published by the Free Software Foundation; either
> > +;* version 2.1 of the License, or (at your option) any later version.
> > +;*
> > +;* FFmpeg is distributed in the hope that it will be useful,
> > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +;* Lesser General Public License for more details.
> > +;*
> > +;* You should have received a copy of the GNU Lesser General Public
> > +;* License along with FFmpeg; if not, write to the Free Software
> > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> >
> +;******************************************************************************
> > +
> > +%include "libavutil/x86/x86util.asm"
> > +
> > +SECTION_RODATA
> > +
> > +pw_p1_n1:  dw  1, -1, 1, -1, 1, -1, 1, -1
> > +pw_n1_p1:  dw  -1, 1, -1, 1, -1, 1, -1, 1
> > +pw_p5_n11: dw  5, -11, 5, -11, 5, -11, 5, -11
> > +pw_n5_p11: dw -5, 11, -5, 11, -5, 11, -5, 11
> > +pw_p11_n5: dw 11, -5, 11, -5, 11, -5, 11, -5
> > +pw_n11_p5: dw -11, 5, -11, 5, -11, 5, -11, 5
> > +pd_4:  times 4 dd  4
> > +pw_n4: times 8 dw -4
> > +cextern pw_m1
> > +cextern pw_1
> > +cextern pw_4
> > +
> > +SECTION .text
>
> [...]
>
> > +
> > +%if ARCH_X86_64
> > +INIT_XMM sse2
> > +cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride,
> lwidth, hwidth, width, height, x, y, pos
> > +    movsxdifnidn  widthq, widthd
> > +    movsxdifnidn heightq, heightd
>
> Why did you add this? The shl and sub below using a d suffix like in the
> previous version is enough to clear the upper bits.
>

ok


>
> > +
> > +    shl  istrideq, 1
> > +
> > +    shl    widthq, 1
> > +    sub   heightq, 2
>
> Should be ok if tested and bitexact.
>

yes. it is bitexact.
diff mbox series

Patch

diff --git a/libavcodec/cfhdencdsp.c b/libavcodec/cfhdencdsp.c
index 0becb76d1d..b979e9e09a 100644
--- a/libavcodec/cfhdencdsp.c
+++ b/libavcodec/cfhdencdsp.c
@@ -73,4 +73,7 @@  av_cold void ff_cfhdencdsp_init(CFHDEncDSPContext *c)
 {
     c->horiz_filter = horiz_filter;
     c->vert_filter = vert_filter;
+
+    if (ARCH_X86)
+        ff_cfhdencdsp_init_x86(c);
 }
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 884dc0c759..6361161180 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -51,6 +51,7 @@  OBJS-$(CONFIG_ALAC_DECODER)            += x86/alacdsp_init.o
 OBJS-$(CONFIG_APNG_DECODER)            += x86/pngdsp_init.o
 OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
 OBJS-$(CONFIG_CFHD_DECODER)            += x86/cfhddsp_init.o
+OBJS-$(CONFIG_CFHD_ENCODER)            += x86/cfhdencdsp_init.o
 OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o x86/synth_filter_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
 OBJS-$(CONFIG_EXR_DECODER)             += x86/exrdsp_init.o
@@ -154,6 +155,7 @@  X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
 X86ASM-OBJS-$(CONFIG_ALAC_DECODER)     += x86/alacdsp.o
 X86ASM-OBJS-$(CONFIG_APNG_DECODER)     += x86/pngdsp.o
 X86ASM-OBJS-$(CONFIG_CAVS_DECODER)     += x86/cavsidct.o
+X86ASM-OBJS-$(CONFIG_CFHD_ENCODER)     += x86/cfhdencdsp.o
 X86ASM-OBJS-$(CONFIG_CFHD_DECODER)     += x86/cfhddsp.o
 X86ASM-OBJS-$(CONFIG_DCA_DECODER)      += x86/dcadsp.o x86/synth_filter.o
 X86ASM-OBJS-$(CONFIG_DIRAC_DECODER)    += x86/diracdsp.o                \
diff --git a/libavcodec/x86/cfhdencdsp.asm b/libavcodec/x86/cfhdencdsp.asm
new file mode 100644
index 0000000000..b0b094aa71
--- /dev/null
+++ b/libavcodec/x86/cfhdencdsp.asm
@@ -0,0 +1,435 @@ 
+;******************************************************************************
+;* x86-optimized functions for the CFHD encoder
+;* Copyright (c) 2021 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_p1_n1:  dw  1, -1, 1, -1, 1, -1, 1, -1 ; pmaddwd weights (+1,-1): a - b per word pair (m10 in both filters)
+pw_n1_p1:  dw  -1, 1, -1, 1, -1, 1, -1, 1 ; same difference for pairs interleaved in swapped order (m11, vertical filter)
+pw_p5_n11: dw  5, -11, 5, -11, 5, -11, 5, -11 ; vertical filter, first output row: 5*row0 - 11*row1
+pw_n5_p11: dw -5, 11, -5, 11, -5, 11, -5, 11 ; vertical filter, last output row, swapped-order half
+pw_p11_n5: dw 11, -5, 11, -5, 11, -5, 11, -5 ; vertical filter, last output row: 11*a - 5*b
+pw_n11_p5: dw -11, 5, -11, 5, -11, 5, -11, 5 ; vertical filter, first output row, swapped-order half
+pd_4:  times 4 dd  4 ; rounding bias added before the arithmetic shift right by 3
+pw_n4: times 8 dw -4 ; weight -4 on both words of a pair (m13, vertical filter)
+cextern pw_m1 ; defined outside this file; presumably all words -1 (per its name)
+cextern pw_1 ; defined outside this file; presumably all words +1 (per its name)
+cextern pw_4 ; defined outside this file; presumably all words +4 (per its name)
+
+SECTION .text
+
+%if ARCH_X86_64 ; horizontal filter: each input row yields one low and one high int16 output per pair of input samples
+INIT_XMM sse2
+cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, hwidth, width, y, x, temp ; 8th argument is named y and serves as the row counter
+    shl  istrideq, 1 ; strides are doubled before being used as byte offsets (buffers hold 16-bit words)
+    shl   lwidthq, 1
+    shl   hwidthq, 1
+    mova       m7, [pd_4] ; m7 = rounding bias 4 (dwords)
+    mova       m8, [pw_1] ; m8 = pmaddwd weights for a + b
+    mova       m9, [pw_m1] ; m9 = pmaddwd weights for -(a + b), presumably all -1
+    mova       m10,[pw_p1_n1] ; m10 = pmaddwd weights for a - b
+    movsxdifnidn yq, yd ; sign-extend the 32-bit int arguments to 64 bits
+    movsxdifnidn widthq, widthd
+    neg        yq ; count rows from -y up to 0
+.looph: ; one input row per iteration
+    movsx          xq, word [inputq] ; left edge, low: in[0] + in[1]
+
+    movsx       tempq, word [inputq + 2]
+    add         tempq, xq
+
+    movd          xm0, tempd
+    packssdw       m0, m0 ; saturate the sum to int16
+    movd        tempd, m0
+    mov   word [lowq], tempw
+
+    movsx          xq, word [inputq] ; left edge, high: (5*in[0] - 11*in[1] + 4*in[2] + 4*in[3] - in[4] - in[5] + 4) >> 3
+    imul           xq, 5
+    movsx       tempq, word [inputq + 2]
+    imul        tempq, -11
+    add         tempq, xq
+
+    movsx          xq, word [inputq + 4]
+    imul           xq, 4
+    add         tempq, xq
+
+    movsx          xq, word [inputq + 6]
+    imul           xq, 4
+    add         tempq, xq
+
+    movsx          xq, word [inputq + 8]
+    imul           xq, -1
+    add         tempq, xq
+
+    movsx          xq, word [inputq + 10]
+    imul           xq, -1
+    add         tempq, xq
+
+    add         tempq, 4 ; round
+    sar         tempq, 3
+
+    movd          xm0, tempd
+    packssdw       m0, m0 ; saturate to int16
+    movd        tempd, m0
+    mov  word [highq], tempw
+
+    mov            xq, 2 ; x = byte offset into the output rows; start at the second output sample
+
+.loopw: ; 8 low and 8 high outputs per iteration
+    movu           m0, [inputq + xq * 2] ; 16 input words for this group of outputs
+    movu           m1, [inputq + xq * 2 + mmsize]
+
+    pmaddwd        m0, m8 ; sum of each adjacent input pair
+    pmaddwd        m1, m8
+
+    packssdw       m0, m1 ; saturate to int16
+    movu    [lowq+xq], m0
+
+    movu           m2, [inputq + xq * 2 - 4] ; the pair preceding each output's own pair
+    movu           m3, [inputq + xq * 2 - 4 + mmsize]
+
+    pmaddwd        m2, m9 ; negated sum of the preceding pair
+    pmaddwd        m3, m9
+
+    movu           m0, [inputq + xq * 2 + 4] ; the pair following each output's own pair
+    movu           m1, [inputq + xq * 2 + 4 + mmsize]
+
+    pmaddwd        m0, m8 ; sum of the following pair
+    pmaddwd        m1, m8
+
+    paddd          m0, m2 ; following pair sum - preceding pair sum
+    paddd          m1, m3
+
+    paddd          m0, m7 ; + 4 rounding bias
+    paddd          m1, m7
+
+    psrad          m0, 3 ; >> 3
+    psrad          m1, 3
+
+    movu           m5, [inputq + xq * 2 + 0] ; the output's own pair
+    movu           m6, [inputq + xq * 2 + mmsize]
+
+    pmaddwd        m5, m10 ; first sample - second sample of the pair
+    pmaddwd        m6, m10
+
+    paddd          m0, m5 ; add the difference after the shift
+    paddd          m1, m6
+
+    packssdw       m0, m1 ; saturate to int16
+    movu   [highq+xq], m0
+
+    add            xq, mmsize
+    cmp            xq, widthq
+    jl .loopw
+
+    add          lowq, widthq ; move the pointers past this row's samples so the right edge is addressed with negative offsets
+    add         highq, widthq
+    lea        inputq, [inputq + widthq * 2]
+
+    movsx          xq, word [inputq - 4] ; right edge, low: sum of the last two input samples
+    movsx       tempq, word [inputq - 2]
+    add         tempq, xq
+
+    movd          xm0, tempd
+    packssdw       m0, m0 ; saturate to int16
+    movd        tempd, m0
+    mov word [lowq-2], tempw
+
+    movsx       tempq, word [inputq - 4] ; right edge, high: (11*in[-2] - 5*in[-1] - 4*in[-3] - 4*in[-4] + in[-5] + in[-6] + 4) >> 3, indices relative to the row end
+    imul        tempq, 11
+    movsx          xq, word [inputq - 2]
+    imul           xq, -5
+    add         tempq, xq
+
+    movsx          xq, word [inputq - 6]
+    imul           xq, -4
+    add         tempq, xq
+
+    movsx          xq, word [inputq - 8]
+    imul           xq, -4
+    add         tempq, xq
+
+    movsx          xq, word [inputq - 10]
+    add         tempq, xq
+
+    movsx          xq, word [inputq - 12]
+    add         tempq, xq
+
+    add         tempq, 4 ; round
+    sar         tempq, 3
+
+    movd          xm0, tempd
+    packssdw       m0, m0 ; saturate to int16
+    movd        tempd, m0
+    mov word [highq-2], tempw
+
+    sub        inputq, widthq ; rewind all three pointers to the start of the row
+    sub        inputq, widthq
+    sub         highq, widthq
+    sub          lowq, widthq
+
+    add          lowq, lwidthq ; advance each pointer by its (doubled) stride to the next row
+    add         highq, hwidthq
+    add        inputq, istrideq
+    add            yq, 1
+    jl .looph ; loop while the negated row counter is still below zero
+
+    RET
+%endif
+
+%if ARCH_X86_64 ; vertical filter: each pair of input rows yields one low and one high output row, 8 columns at a time
+INIT_XMM sse2
+cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidth, width, height, x, y, pos
+    movsxdifnidn  widthq, widthd ; sign-extend the 32-bit int arguments to 64 bits
+    movsxdifnidn heightq, heightd
+
+    shl  istrideq, 1 ; input stride is doubled before being used as a byte offset
+
+    shl    widthq, 1 ; column bound compared against the byte offset x
+    sub   heightq, 2 ; .looph bound; the last row pair is handled after the loop
+
+    xor        xq, xq ; x = byte offset of the current column group
+
+    mova       m7, [pd_4] ; m7 = rounding bias 4 (dwords)
+    mova       m8, [pw_1] ; m8 = pmaddwd weights for a + b
+    mova       m9, [pw_m1] ; m9 = pmaddwd weights for -(a + b), presumably all -1
+    mova       m10,[pw_p1_n1] ; m10 = weights (+1,-1)
+    mova       m11,[pw_n1_p1] ; m11 = weights (-1,+1), for the swapped-order halves
+    mova       m12,[pw_4] ; m12 = weights for 4*(a + b), presumably all +4
+    mova       m13,[pw_n4] ; m13 = weights for -4*(a + b)
+.loopw: ; one group of 8 columns per iteration
+    mov        yq, 2 ; .looph starts at row 2; rows 0 and 1 are handled right here
+
+    mov      posq, xq
+    movu       m0, [inputq + posq] ; row 0
+    add      posq, istrideq
+    movu       m1, [inputq + posq] ; row 1
+
+    paddsw     m0, m1 ; first low row: saturating row0 + row1
+
+    movu    [lowq + xq], m0
+
+    mov      posq, xq
+
+    movu       m0, [inputq + posq] ; rows 0..5 feed the first high row
+    add      posq, istrideq
+    movu       m1, [inputq + posq]
+    add      posq, istrideq
+    movu       m2, [inputq + posq]
+    add      posq, istrideq
+    movu       m3, [inputq + posq]
+    add      posq, istrideq
+    movu       m4, [inputq + posq]
+    add      posq, istrideq
+    movu       m5, [inputq + posq]
+
+    mova       m6, m0
+    punpcklwd  m0, m1 ; columns 0-3 as (row0,row1) word pairs
+    punpckhwd  m1, m6 ; columns 4-7 as (row1,row0) word pairs - note the swapped order
+
+    mova       m6, m2
+    punpcklwd  m2, m3 ; rows 2,3 interleaved the same way
+    punpckhwd  m3, m6
+
+    mova       m6, m4
+    punpcklwd  m4, m5 ; rows 4,5 interleaved the same way
+    punpckhwd  m5, m6
+
+    pmaddwd    m0, [pw_p5_n11] ; 5*row0 - 11*row1
+    pmaddwd    m1, [pw_n11_p5] ; same value; weights swapped to match the swapped pairs
+    pmaddwd    m2, m12 ; 4*(row2 + row3)
+    pmaddwd    m3, m12
+    pmaddwd    m4, m9 ; -(row4 + row5)
+    pmaddwd    m5, m9
+
+    paddd      m0, m2
+    paddd      m1, m3
+    paddd      m0, m4
+    paddd      m1, m5
+
+    paddd      m0, m7 ; + 4 rounding bias
+    paddd      m1, m7
+
+    psrad      m0, 3 ; >> 3
+    psrad      m1, 3
+    packssdw   m0, m1 ; saturate to int16, columns back in order
+
+    movu   [highq + xq], m0
+
+.looph: ; interior rows: y advances by 2 input rows per iteration
+
+    mov      posq, istrideq ; pos = istride * y + x
+    imul     posq, yq
+    add      posq, xq
+
+    movu       m0, [inputq + posq] ; row y
+
+    add      posq, istrideq
+    movu       m1, [inputq + posq] ; row y+1
+
+    paddsw     m0, m1 ; low: saturating row[y] + row[y+1]
+
+    mov      posq, lwidthq ; pos = lwidth * y + x (lwidth itself is not doubled in this function)
+    imul     posq, yq
+    add      posq, xq
+
+    movu    [lowq + posq], m0
+
+    add        yq, -2 ; step back two rows: the high-pass window covers rows y-2 .. y+3
+
+    mov      posq, istrideq
+    imul     posq, yq
+    add      posq, xq
+
+    movu       m0, [inputq + posq]
+    add      posq, istrideq
+    movu       m1, [inputq + posq]
+    add      posq, istrideq
+    movu       m2, [inputq + posq]
+    add      posq, istrideq
+    movu       m3, [inputq + posq]
+    add      posq, istrideq
+    movu       m4, [inputq + posq]
+    add      posq, istrideq
+    movu       m5, [inputq + posq]
+
+    add        yq, 2 ; restore y
+
+    mova       m6, m0
+    punpcklwd  m0, m1 ; each row pair interleaved: low half in order, high half swapped
+    punpckhwd  m1, m6
+
+    mova       m6, m2
+    punpcklwd  m2, m3
+    punpckhwd  m3, m6
+
+    mova       m6, m4
+    punpcklwd  m4, m5
+    punpckhwd  m5, m6
+
+    pmaddwd    m0, m9 ; -(row[y-2] + row[y-1])
+    pmaddwd    m1, m9
+    pmaddwd    m2, m10 ; row[y] - row[y+1]
+    pmaddwd    m3, m11 ; same difference for the swapped-order half
+    pmaddwd    m4, m8 ; row[y+2] + row[y+3]
+    pmaddwd    m5, m8
+
+    paddd      m0, m4
+    paddd      m1, m5
+
+    paddd      m0, m7 ; + 4 rounding bias
+    paddd      m1, m7
+
+    psrad      m0, 3 ; >> 3
+    psrad      m1, 3
+    paddd      m0, m2 ; add the row difference after the shift
+    paddd      m1, m3
+    packssdw   m0, m1 ; saturate to int16
+
+    mov      posq, hwidthq ; pos = hwidth * y + x
+    imul     posq, yq
+    add      posq, xq
+
+    movu   [highq + posq], m0
+
+    add        yq, 2
+    cmp        yq, heightq ; heightq was reduced by 2 on entry
+    jl .looph
+
+    mov      posq, istrideq ; bottom edge: y now indexes the row pair left over by the loop
+    imul     posq, yq
+    add      posq, xq
+
+    movu       m0, [inputq + posq]
+    add      posq, istrideq
+    movu       m1, [inputq + posq]
+
+    paddsw     m0, m1 ; last low row: saturating row[y] + row[y+1]
+
+    mov      posq, lwidthq
+    imul     posq, yq
+    add      posq, xq
+
+    movu    [lowq + posq], m0
+
+    sub        yq, 4 ; the last high row uses rows y-4 .. y+1
+
+    mov      posq, istrideq
+    imul     posq, yq
+    add      posq, xq
+
+    movu       m0, [inputq + posq]
+    add      posq, istrideq
+    movu       m1, [inputq + posq]
+    add      posq, istrideq
+    movu       m2, [inputq + posq]
+    add      posq, istrideq
+    movu       m3, [inputq + posq]
+    add      posq, istrideq
+    movu       m4, [inputq + posq]
+    add      posq, istrideq
+    movu       m5, [inputq + posq]
+
+    add        yq, 4 ; restore y
+
+    mova       m6, m0
+    punpcklwd  m0, m1
+    punpckhwd  m1, m6
+
+    mova       m6, m2
+    punpcklwd  m2, m3
+    punpckhwd  m3, m6
+
+    mova       m6, m4
+    punpcklwd  m4, m5
+    punpckhwd  m5, m6
+
+    pmaddwd    m0, m8 ; row[y-4] + row[y-3]
+    pmaddwd    m1, m8
+    pmaddwd    m2, m13 ; -4*(row[y-2] + row[y-1])
+    pmaddwd    m3, m13
+    pmaddwd    m4, [pw_p11_n5] ; 11*row[y] - 5*row[y+1]
+    pmaddwd    m5, [pw_n5_p11] ; same value; weights swapped to match the swapped pairs
+
+    paddd      m4, m2
+    paddd      m5, m3
+
+    paddd      m4, m0
+    paddd      m5, m1
+
+    paddd      m4, m7 ; + 4 rounding bias
+    paddd      m5, m7
+
+    psrad      m4, 3 ; >> 3
+    psrad      m5, 3
+    packssdw   m4, m5 ; saturate to int16
+
+    mov      posq, hwidthq
+    imul     posq, yq
+    add      posq, xq
+
+    movu   [highq + posq], m4
+
+    add        xq, mmsize ; next group of 8 columns
+    cmp        xq, widthq
+    jl .loopw
+    RET
+%endif
diff --git a/libavcodec/x86/cfhdencdsp_init.c b/libavcodec/x86/cfhdencdsp_init.c
new file mode 100644
index 0000000000..28f1dd504d
--- /dev/null
+++ b/libavcodec/x86/cfhdencdsp_init.c
@@ -0,0 +1,48 @@ 
+/*
+ * Copyright (c) 2021 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/cfhdencdsp.h"
+
+void ff_cfhdenc_horiz_filter_sse2(int16_t *input, int16_t *low, int16_t *high,
+                                  ptrdiff_t in_stride, ptrdiff_t low_stride,
+                                  ptrdiff_t high_stride,
+                                  int width, int height); /* SSE2 horizontal filter from cfhdencdsp.asm; the asm doubles all three strides before use */
+void ff_cfhdenc_vert_filter_sse2(int16_t *input, int16_t *low, int16_t *high,
+                                 ptrdiff_t in_stride, ptrdiff_t low_stride,
+                                 ptrdiff_t high_stride,
+                                 int width, int height); /* SSE2 vertical filter from cfhdencdsp.asm; consumes two input rows per low/high output row */
+
+av_cold void ff_cfhdencdsp_init_x86(CFHDEncDSPContext *c) /* install the x86 SIMD filters into c when the CPU supports them */
+{
+#if ARCH_X86_64 /* the SSE2 filters are only assembled for x86-64 */
+    int cpu_flags = av_get_cpu_flags(); /* declared inside the #if so 32-bit builds have no unused variable */
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->horiz_filter = ff_cfhdenc_horiz_filter_sse2;
+        c->vert_filter = ff_cfhdenc_vert_filter_sse2;
+    }
+#endif
+}