diff mbox series

[FFmpeg-devel] avcodec/cfhd: add x86 SIMD

Message ID 20200821111329.3466-1-onemda@gmail.com
State Accepted
Headers show
Series [FFmpeg-devel] avcodec/cfhd: add x86 SIMD | expand

Checks

Context Check Description
andriy/default pending
andriy/make success Make finished
andriy/make_fate success Make fate finished

Commit Message

Paul B Mahol Aug. 21, 2020, 11:13 a.m. UTC
Overall speed changes for 1920x1080, yuv422p10le, 60fps from: 0.19x to 0.343x

Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavcodec/Makefile           |   2 +-
 libavcodec/cfhd.c             | 337 ++++--------------
 libavcodec/cfhd.h             |   3 +
 libavcodec/cfhddsp.c          | 118 +++++++
 libavcodec/cfhddsp.h          |  44 +++
 libavcodec/x86/Makefile       |   2 +
 libavcodec/x86/cfhddsp.asm    | 619 ++++++++++++++++++++++++++++++++++
 libavcodec/x86/cfhddsp_init.c |  52 +++
 8 files changed, 903 insertions(+), 274 deletions(-)
 create mode 100644 libavcodec/cfhddsp.c
 create mode 100644 libavcodec/cfhddsp.h
 create mode 100644 libavcodec/x86/cfhddsp.asm
 create mode 100644 libavcodec/x86/cfhddsp_init.c

Comments

Moritz Barsnick Aug. 21, 2020, 1:10 p.m. UTC | #1
On Fri, Aug 21, 2020 at 13:13:29 +0200, Paul B Mahol wrote:
> Overall speed changes for 1920x1080, yuv422p10le, 60fps from: 0.19x to 0.343x

Let me add my promised benchmarks. I have nothing modern here (these
CPUs are 10, 18 and 6 years old), but I can confirm the impressive
improvements:

                          Intel Atom D525   Intel Pentium 4  Intel Haswell i5-4200U

MT_BeartoothHighw....avi: 0.489x -> 0.835x  0.23x -> 0.55x   2.13x  -> 3.07x
bigger_res.mov:           0.057x -> 0.069x  0.02x -> 0.058x  0.174x -> 0.237x
timelapse-pt-momsapt.avi: 1.11x  -> 1.9x    0.53x -> 1.37x   4.3x   -> 6.2x

No more crashes with this patch, on any of the three platforms.

Cheers,
Moritz

P.S.: Both timelapse-pt-momsapt.avi and MT_BeartoothHighway_1min_Cineform.avi
end with corrupt AVI packets, resulting in
[cfhd @ 0xb2428c0] Escape codeword not found, probably corrupt data

Certainly unrelated to this patch.
Moritz Barsnick Aug. 21, 2020, 1:40 p.m. UTC | #2
On Fri, Aug 21, 2020 at 15:10:09 +0200, Moritz Barsnick wrote:
> Let me add my promised benchmarks. I have nothing modern here (these
> CPUs are 10, 18 and 6 years old), but I can confirm the impressive
> improvements:
>
>                           Intel Atom D525   Intel Pentium 4  Intel Haswell i5-4200U
>
> MT_BeartoothHighw....avi: 0.489x -> 0.835x  0.23x -> 0.55x   2.13x  -> 3.07x
> bigger_res.mov:           0.057x -> 0.069x  0.02x -> 0.058x  0.174x -> 0.237x
> timelapse-pt-momsapt.avi: 1.11x  -> 1.9x    0.53x -> 1.37x   4.3x   -> 6.2x

Here are results from a more modern CPU. They are heavily skewed because
the test ran in a VM restricted to 2 CPUs and 95% of the host CPU (yeah,
shouldn't do that), but at least the speed-up is visible.

                          Intel Kaby Lake i7-7500U

MT_BeartoothHighw....avi: 2.0x  -> 3.4x
bigger_res.mov:           0.21x -> 0.27x
timelapse-pt-momsapt.avi: 3.5x  -> 6.6x

Moritz
Paul B Mahol Aug. 23, 2020, 11:28 a.m. UTC | #3
On 8/21/20, Paul B Mahol <onemda@gmail.com> wrote:
> Overall speed changes for 1920x1080, yuv422p10le, 60fps from: 0.19x to
> 0.343x
>
> Signed-off-by: Paul B Mahol <onemda@gmail.com>

Will apply soon.
diff mbox series

Patch

diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 3431ba2dca..a770198475 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -255,7 +255,7 @@  OBJS-$(CONFIG_CCAPTION_DECODER)        += ccaption_dec.o ass.o
 OBJS-$(CONFIG_CDGRAPHICS_DECODER)      += cdgraphics.o
 OBJS-$(CONFIG_CDTOONS_DECODER)         += cdtoons.o
 OBJS-$(CONFIG_CDXL_DECODER)            += cdxl.o
-OBJS-$(CONFIG_CFHD_DECODER)            += cfhd.o cfhddata.o
+OBJS-$(CONFIG_CFHD_DECODER)            += cfhd.o cfhddata.o cfhddsp.o
 OBJS-$(CONFIG_CFHD_ENCODER)            += cfhdenc.o cfhddata.o
 OBJS-$(CONFIG_CINEPAK_DECODER)         += cinepak.o
 OBJS-$(CONFIG_CINEPAK_ENCODER)         += cinepakenc.o elbg.o
diff --git a/libavcodec/cfhd.c b/libavcodec/cfhd.c
index 2b1db0ed8d..291d53e02e 100644
--- a/libavcodec/cfhd.c
+++ b/libavcodec/cfhd.c
@@ -190,47 +190,6 @@  static inline void process_bayer(AVFrame *frame, int bpc)
     }
 }
 
-static inline void filter(int16_t *output, ptrdiff_t out_stride,
-                          int16_t *low, ptrdiff_t low_stride,
-                          int16_t *high, ptrdiff_t high_stride,
-                          int len, int clip)
-{
-    int16_t tmp;
-    int i;
-
-    tmp = (11*low[0*low_stride] - 4*low[1*low_stride] + low[2*low_stride] + 4) >> 3;
-    output[(2*0+0)*out_stride] = (tmp + high[0*high_stride]) >> 1;
-    if (clip)
-        output[(2*0+0)*out_stride] = av_clip_uintp2_c(output[(2*0+0)*out_stride], clip);
-
-    tmp = ( 5*low[0*low_stride] + 4*low[1*low_stride] - low[2*low_stride] + 4) >> 3;
-    output[(2*0+1)*out_stride] = (tmp - high[0*high_stride]) >> 1;
-    if (clip)
-        output[(2*0+1)*out_stride] = av_clip_uintp2_c(output[(2*0+1)*out_stride], clip);
-
-    for (i = 1; i < len - 1; i++) {
-        tmp = (low[(i-1)*low_stride] - low[(i+1)*low_stride] + 4) >> 3;
-        output[(2*i+0)*out_stride] = (tmp + low[i*low_stride] + high[i*high_stride]) >> 1;
-        if (clip)
-            output[(2*i+0)*out_stride] = av_clip_uintp2_c(output[(2*i+0)*out_stride], clip);
-
-        tmp = (low[(i+1)*low_stride] - low[(i-1)*low_stride] + 4) >> 3;
-        output[(2*i+1)*out_stride] = (tmp + low[i*low_stride] - high[i*high_stride]) >> 1;
-        if (clip)
-            output[(2*i+1)*out_stride] = av_clip_uintp2_c(output[(2*i+1)*out_stride], clip);
-    }
-
-    tmp = ( 5*low[i*low_stride] + 4*low[(i-1)*low_stride] - low[(i-2)*low_stride] + 4) >> 3;
-    output[(2*i+0)*out_stride] = (tmp + high[i*high_stride]) >> 1;
-    if (clip)
-        output[(2*i+0)*out_stride] = av_clip_uintp2_c(output[(2*i+0)*out_stride], clip);
-
-    tmp = (11*low[i*low_stride] - 4*low[(i-1)*low_stride] + low[(i-2)*low_stride] + 4) >> 3;
-    output[(2*i+1)*out_stride] = (tmp - high[i*high_stride]) >> 1;
-    if (clip)
-        output[(2*i+1)*out_stride] = av_clip_uintp2_c(output[(2*i+1)*out_stride], clip);
-}
-
 static inline void interlaced_vertical_filter(int16_t *output, int16_t *low, int16_t *high,
                          int width, int linesize, int plane)
 {
@@ -244,8 +203,7 @@  static inline void interlaced_vertical_filter(int16_t *output, int16_t *low, int
     }
 }
 
-static inline void inverse_temporal_filter(int16_t *output, int16_t *low, int16_t *high,
-                                           int width)
+static inline void inverse_temporal_filter(int16_t *low, int16_t *high, int width)
 {
     for (int i = 0; i < width; i++) {
         int even = (low[i] - high[i]) / 2;
@@ -256,31 +214,6 @@  static inline void inverse_temporal_filter(int16_t *output, int16_t *low, int16_
     }
 }
 
-static void horiz_filter(int16_t *output, int16_t *low, int16_t *high,
-                         int width)
-{
-    filter(output, 1, low, 1, high, 1, width, 0);
-}
-
-static void horiz_filter_clip(int16_t *output, int16_t *low, int16_t *high,
-                              int width, int clip)
-{
-    filter(output, 1, low, 1, high, 1, width, clip);
-}
-
-static void horiz_filter_clip_bayer(int16_t *output, int16_t *low, int16_t *high,
-                                    int width, int clip)
-{
-    filter(output, 2, low, 1, high, 1, width, clip);
-}
-
-static void vert_filter(int16_t *output, ptrdiff_t out_stride,
-                        int16_t *low, ptrdiff_t low_stride,
-                        int16_t *high, ptrdiff_t high_stride, int len)
-{
-    filter(output, out_stride, low, low_stride, high, high_stride, len, 0);
-}
-
 static void free_buffers(CFHDContext *s)
 {
     int i, j;
@@ -311,6 +244,8 @@  static int alloc_buffers(AVCodecContext *avctx)
         return ret;
     avctx->pix_fmt = s->coded_format;
 
+    ff_cfhddsp_init(&s->dsp, s->bpc, avctx->pix_fmt == AV_PIX_FMT_BAYER_RGGB16);
+
     if ((ret = av_pix_fmt_get_chroma_sub_sample(s->coded_format,
                                                 &chroma_x_shift,
                                                 &chroma_y_shift)) < 0)
@@ -327,7 +262,7 @@  static int alloc_buffers(AVCodecContext *avctx)
         int w8, h8, w4, h4, w2, h2;
         int width  = (i || bayer) ? s->coded_width  >> chroma_x_shift : s->coded_width;
         int height = (i || bayer) ? s->coded_height >> chroma_y_shift : s->coded_height;
-        ptrdiff_t stride = FFALIGN(width  / 8, 8) * 8;
+        ptrdiff_t stride = (FFALIGN(width  / 8, 8) + 64) * 8;
 
         if (chroma_y_shift && !bayer)
             height = FFALIGN(height / 8, 2) * 8;
@@ -335,7 +270,7 @@  static int alloc_buffers(AVCodecContext *avctx)
         s->plane[i].height = height;
         s->plane[i].stride = stride;
 
-        w8 = FFALIGN(s->plane[i].width  / 8, 8);
+        w8 = FFALIGN(s->plane[i].width  / 8, 8) + 64;
         h8 = FFALIGN(height, 8) / 8;
         w4 = w8 * 2;
         h4 = h8 * 2;
@@ -430,6 +365,7 @@  static int cfhd_decode(AVCodecContext *avctx, void *data, int *got_frame,
                        AVPacket *avpkt)
 {
     CFHDContext *s = avctx->priv_data;
+    CFHDDSPContext *dsp = &s->dsp;
     GetByteContext gb;
     ThreadFrame frame = { .f = data };
     AVFrame *pic = data;
@@ -770,7 +706,7 @@  static int cfhd_decode(AVCodecContext *avctx, void *data, int *got_frame,
             }
 
             if (lowpass_height > lowpass_a_height || lowpass_width > lowpass_a_width ||
-                lowpass_a_width * lowpass_a_height * sizeof(int16_t) > bytestream2_get_bytes_left(&gb)) {
+                lowpass_width * lowpass_height * sizeof(int16_t) > bytestream2_get_bytes_left(&gb)) {
                 av_log(avctx, AV_LOG_ERROR, "Too many lowpass coefficients\n");
                 ret = AVERROR(EINVAL);
                 goto end;
@@ -921,13 +857,6 @@  static int cfhd_decode(AVCodecContext *avctx, void *data, int *got_frame,
 finish:
             if (s->subband_num_actual != 255)
                 s->codebook = 0;
-
-            /* Copy last line of coefficients if odd height */
-            if (highpass_height & 1) {
-                memcpy(&coeff_data[highpass_height * highpass_stride],
-                       &coeff_data[(highpass_height - 1) * highpass_stride],
-                       highpass_stride * sizeof(*coeff_data));
-            }
         }
     }
 
@@ -956,6 +885,7 @@  finish:
         for (plane = 0; plane < s->planes && !ret; plane++) {
             /* level 1 */
             int lowpass_height  = s->plane[plane].band[0][0].height;
+            int output_stride   = s->plane[plane].band[0][0].a_width;
             int lowpass_width   = s->plane[plane].band[0][0].width;
             int highpass_stride = s->plane[plane].band[0][1].stride;
             int act_plane = plane == 1 ? 2 : plane == 2 ? 1 : plane;
@@ -981,46 +911,31 @@  finish:
             low    = s->plane[plane].subband[0];
             high   = s->plane[plane].subband[2];
             output = s->plane[plane].l_h[0];
-            for (i = 0; i < lowpass_width; i++) {
-                vert_filter(output, lowpass_width, low, lowpass_width, high, highpass_stride, lowpass_height);
-                low++;
-                high++;
-                output++;
-            }
+            dsp->vert_filter(output, output_stride, low, lowpass_width, high, highpass_stride, lowpass_width, lowpass_height);
 
             low    = s->plane[plane].subband[1];
             high   = s->plane[plane].subband[3];
             output = s->plane[plane].l_h[1];
 
-            for (i = 0; i < lowpass_width; i++) {
-                // note the stride of "low" is highpass_stride
-                vert_filter(output, lowpass_width, low, highpass_stride, high, highpass_stride, lowpass_height);
-                low++;
-                high++;
-                output++;
-            }
+            dsp->vert_filter(output, output_stride, low, highpass_stride, high, highpass_stride, lowpass_width, lowpass_height);
 
             low    = s->plane[plane].l_h[0];
             high   = s->plane[plane].l_h[1];
             output = s->plane[plane].subband[0];
-            for (i = 0; i < lowpass_height * 2; i++) {
-                horiz_filter(output, low, high, lowpass_width);
-                low    += lowpass_width;
-                high   += lowpass_width;
-                output += lowpass_width * 2;
-            }
+            dsp->horiz_filter(output, output_stride, low, output_stride, high, output_stride, lowpass_width, lowpass_height * 2);
             if (s->bpc == 12) {
                 output = s->plane[plane].subband[0];
                 for (i = 0; i < lowpass_height * 2; i++) {
                     for (j = 0; j < lowpass_width * 2; j++)
                         output[j] *= 4;
 
-                    output += lowpass_width * 2;
+                    output += output_stride * 2;
                 }
             }
 
             /* level 2 */
             lowpass_height  = s->plane[plane].band[1][1].height;
+            output_stride   = s->plane[plane].band[1][1].a_width;
             lowpass_width   = s->plane[plane].band[1][1].width;
             highpass_stride = s->plane[plane].band[1][1].stride;
 
@@ -1036,43 +951,29 @@  finish:
             low    = s->plane[plane].subband[0];
             high   = s->plane[plane].subband[5];
             output = s->plane[plane].l_h[3];
-            for (i = 0; i < lowpass_width; i++) {
-                vert_filter(output, lowpass_width, low, lowpass_width, high, highpass_stride, lowpass_height);
-                low++;
-                high++;
-                output++;
-            }
+            dsp->vert_filter(output, output_stride, low, output_stride, high, highpass_stride, lowpass_width, lowpass_height);
 
             low    = s->plane[plane].subband[4];
             high   = s->plane[plane].subband[6];
             output = s->plane[plane].l_h[4];
-            for (i = 0; i < lowpass_width; i++) {
-                vert_filter(output, lowpass_width, low, highpass_stride, high, highpass_stride, lowpass_height);
-                low++;
-                high++;
-                output++;
-            }
+            dsp->vert_filter(output, output_stride, low, highpass_stride, high, highpass_stride, lowpass_width, lowpass_height);
 
             low    = s->plane[plane].l_h[3];
             high   = s->plane[plane].l_h[4];
             output = s->plane[plane].subband[0];
-            for (i = 0; i < lowpass_height * 2; i++) {
-                horiz_filter(output, low, high, lowpass_width);
-                low    += lowpass_width;
-                high   += lowpass_width;
-                output += lowpass_width * 2;
-            }
+            dsp->horiz_filter(output, output_stride, low, output_stride, high, output_stride, lowpass_width, lowpass_height * 2);
 
             output = s->plane[plane].subband[0];
             for (i = 0; i < lowpass_height * 2; i++) {
                 for (j = 0; j < lowpass_width * 2; j++)
                     output[j] *= 4;
 
-                output += lowpass_width * 2;
+                output += output_stride * 2;
             }
 
             /* level 3 */
             lowpass_height  = s->plane[plane].band[2][1].height;
+            output_stride   = s->plane[plane].band[2][1].a_width;
             lowpass_width   = s->plane[plane].band[2][1].width;
             highpass_stride = s->plane[plane].band[2][1].stride;
 
@@ -1088,22 +989,12 @@  finish:
                 low    = s->plane[plane].subband[0];
                 high   = s->plane[plane].subband[8];
                 output = s->plane[plane].l_h[6];
-                for (i = 0; i < lowpass_width; i++) {
-                    vert_filter(output, lowpass_width, low, lowpass_width, high, highpass_stride, lowpass_height);
-                    low++;
-                    high++;
-                    output++;
-                }
+                dsp->vert_filter(output, output_stride, low, output_stride, high, highpass_stride, lowpass_width, lowpass_height);
 
                 low    = s->plane[plane].subband[7];
                 high   = s->plane[plane].subband[9];
                 output = s->plane[plane].l_h[7];
-                for (i = 0; i < lowpass_width; i++) {
-                    vert_filter(output, lowpass_width, low, highpass_stride, high, highpass_stride, lowpass_height);
-                    low++;
-                    high++;
-                    output++;
-                }
+                dsp->vert_filter(output, output_stride, low, highpass_stride, high, highpass_stride, lowpass_width, lowpass_height);
 
                 dst = (int16_t *)pic->data[act_plane];
                 if (avctx->pix_fmt == AV_PIX_FMT_BAYER_RGGB16) {
@@ -1124,14 +1015,11 @@  finish:
                 }
 
                 for (i = 0; i < lowpass_height * 2; i++) {
-                    if (avctx->pix_fmt == AV_PIX_FMT_BAYER_RGGB16)
-                        horiz_filter_clip_bayer(dst, low, high, lowpass_width, s->bpc);
-                    else
-                        horiz_filter_clip(dst, low, high, lowpass_width, s->bpc);
+                    dsp->horiz_filter_clip(dst, low, high, lowpass_width, s->bpc);
                     if (avctx->pix_fmt == AV_PIX_FMT_GBRAP12 && act_plane == 3)
                         process_alpha(dst, lowpass_width * 2);
-                    low  += lowpass_width;
-                    high += lowpass_width;
+                    low  += output_stride;
+                    high += output_stride;
                     dst  += dst_linesize;
                 }
             } else {
@@ -1140,30 +1028,20 @@  finish:
                 low    = s->plane[plane].subband[0];
                 high   = s->plane[plane].subband[7];
                 output = s->plane[plane].l_h[6];
-                for (i = 0; i < lowpass_height; i++) {
-                    horiz_filter(output, low, high, lowpass_width);
-                    low    += lowpass_width;
-                    high   += lowpass_width;
-                    output += lowpass_width * 2;
-                }
+                dsp->horiz_filter(output, output_stride, low, output_stride, high, highpass_stride, lowpass_width, lowpass_height);
 
                 low    = s->plane[plane].subband[8];
                 high   = s->plane[plane].subband[9];
                 output = s->plane[plane].l_h[7];
-                for (i = 0; i < lowpass_height; i++) {
-                    horiz_filter(output, low, high, lowpass_width);
-                    low    += lowpass_width;
-                    high   += lowpass_width;
-                    output += lowpass_width * 2;
-                }
+                dsp->horiz_filter(output, output_stride, low, highpass_stride, high, highpass_stride, lowpass_width, lowpass_height);
 
                 dst  = (int16_t *)pic->data[act_plane];
                 low  = s->plane[plane].l_h[6];
                 high = s->plane[plane].l_h[7];
                 for (i = 0; i < lowpass_height; i++) {
                     interlaced_vertical_filter(dst, low, high, lowpass_width * 2,  pic->linesize[act_plane]/2, act_plane);
-                    low  += lowpass_width * 2;
-                    high += lowpass_width * 2;
+                    low  += output_stride * 2;
+                    high += output_stride * 2;
                     dst  += pic->linesize[act_plane];
                 }
             }
@@ -1171,6 +1049,7 @@  finish:
     } else if (s->transform_type == 2 && (avctx->internal->is_copy || s->frame_index == 1 || s->sample_type != 1)) {
         for (plane = 0; plane < s->planes && !ret; plane++) {
             int lowpass_height  = s->plane[plane].band[0][0].height;
+            int output_stride   = s->plane[plane].band[0][0].a_width;
             int lowpass_width   = s->plane[plane].band[0][0].width;
             int highpass_stride = s->plane[plane].band[0][1].stride;
             int act_plane = plane == 1 ? 2 : plane == 2 ? 1 : plane;
@@ -1196,43 +1075,29 @@  finish:
             low    = s->plane[plane].subband[0];
             high   = s->plane[plane].subband[2];
             output = s->plane[plane].l_h[0];
-            for (i = 0; i < lowpass_width; i++) {
-                vert_filter(output, lowpass_width, low, lowpass_width, high, highpass_stride, lowpass_height);
-                low++;
-                high++;
-                output++;
-            }
+            dsp->vert_filter(output, output_stride, low, lowpass_width, high, highpass_stride, lowpass_width, lowpass_height);
 
             low    = s->plane[plane].subband[1];
             high   = s->plane[plane].subband[3];
             output = s->plane[plane].l_h[1];
-            for (i = 0; i < lowpass_width; i++) {
-                vert_filter(output, lowpass_width, low, highpass_stride, high, highpass_stride, lowpass_height);
-                low++;
-                high++;
-                output++;
-            }
+            dsp->vert_filter(output, output_stride, low, highpass_stride, high, highpass_stride, lowpass_width, lowpass_height);
 
             low    = s->plane[plane].l_h[0];
             high   = s->plane[plane].l_h[1];
             output = s->plane[plane].l_h[7];
-            for (i = 0; i < lowpass_height * 2; i++) {
-                horiz_filter(output, low, high, lowpass_width);
-                low    += lowpass_width;
-                high   += lowpass_width;
-                output += lowpass_width * 2;
-            }
+            dsp->horiz_filter(output, output_stride, low, output_stride, high, output_stride, lowpass_width, lowpass_height * 2);
             if (s->bpc == 12) {
                 output = s->plane[plane].l_h[7];
                 for (i = 0; i < lowpass_height * 2; i++) {
                     for (j = 0; j < lowpass_width * 2; j++)
                         output[j] *= 4;
 
-                    output += lowpass_width * 2;
+                    output += output_stride * 2;
                 }
             }
 
             lowpass_height  = s->plane[plane].band[1][1].height;
+            output_stride   = s->plane[plane].band[1][1].a_width;
             lowpass_width   = s->plane[plane].band[1][1].width;
             highpass_stride = s->plane[plane].band[1][1].stride;
 
@@ -1248,71 +1113,42 @@  finish:
             low    = s->plane[plane].l_h[7];
             high   = s->plane[plane].subband[5];
             output = s->plane[plane].l_h[3];
-            for (i = 0; i < lowpass_width; i++) {
-                vert_filter(output, lowpass_width, low, lowpass_width, high, highpass_stride, lowpass_height);
-                low++;
-                high++;
-                output++;
-            }
+            dsp->vert_filter(output, output_stride, low, output_stride, high, highpass_stride, lowpass_width, lowpass_height);
 
             low    = s->plane[plane].subband[4];
             high   = s->plane[plane].subband[6];
             output = s->plane[plane].l_h[4];
-            for (i = 0; i < lowpass_width; i++) {
-                vert_filter(output, lowpass_width, low, highpass_stride, high, highpass_stride, lowpass_height);
-                low++;
-                high++;
-                output++;
-            }
+            dsp->vert_filter(output, output_stride, low, highpass_stride, high, highpass_stride, lowpass_width, lowpass_height);
 
             low    = s->plane[plane].l_h[3];
             high   = s->plane[plane].l_h[4];
             output = s->plane[plane].l_h[7];
-            for (i = 0; i < lowpass_height * 2; i++) {
-                horiz_filter(output, low, high, lowpass_width);
-                low    += lowpass_width;
-                high   += lowpass_width;
-                output += lowpass_width * 2;
-            }
+            dsp->horiz_filter(output, output_stride, low, output_stride, high, output_stride, lowpass_width, lowpass_height * 2);
 
             output = s->plane[plane].l_h[7];
             for (i = 0; i < lowpass_height * 2; i++) {
                 for (j = 0; j < lowpass_width * 2; j++)
                     output[j] *= 4;
-                output += lowpass_width * 2;
+                output += output_stride * 2;
             }
 
             low    = s->plane[plane].subband[7];
             high   = s->plane[plane].subband[9];
             output = s->plane[plane].l_h[3];
-            for (i = 0; i < lowpass_width; i++) {
-                vert_filter(output, lowpass_width, low, lowpass_width, high, highpass_stride, lowpass_height);
-                low++;
-                high++;
-                output++;
-            }
+            dsp->vert_filter(output, output_stride, low, highpass_stride, high, highpass_stride, lowpass_width, lowpass_height);
 
             low    = s->plane[plane].subband[8];
             high   = s->plane[plane].subband[10];
             output = s->plane[plane].l_h[4];
-            for (i = 0; i < lowpass_width; i++) {
-                vert_filter(output, lowpass_width, low, highpass_stride, high, highpass_stride, lowpass_height);
-                low++;
-                high++;
-                output++;
-            }
+            dsp->vert_filter(output, output_stride, low, highpass_stride, high, highpass_stride, lowpass_width, lowpass_height);
 
             low    = s->plane[plane].l_h[3];
             high   = s->plane[plane].l_h[4];
             output = s->plane[plane].l_h[9];
-            for (i = 0; i < lowpass_height * 2; i++) {
-                horiz_filter(output, low, high, lowpass_width);
-                low    += lowpass_width;
-                high   += lowpass_width;
-                output += lowpass_width * 2;
-            }
+            dsp->horiz_filter(output, output_stride, low, output_stride, high, output_stride, lowpass_width, lowpass_height * 2);
 
             lowpass_height  = s->plane[plane].band[4][1].height;
+            output_stride   = s->plane[plane].band[4][1].a_width;
             lowpass_width   = s->plane[plane].band[4][1].width;
             highpass_stride = s->plane[plane].band[4][1].stride;
             av_log(avctx, AV_LOG_DEBUG, "temporal level %i %i %i %i\n", plane, lowpass_height, lowpass_width, highpass_stride);
@@ -1328,50 +1164,30 @@  finish:
             high   = s->plane[plane].l_h[9];
             output = s->plane[plane].l_h[7];
             for (i = 0; i < lowpass_height; i++) {
-                inverse_temporal_filter(output, low, high, lowpass_width);
-                low    += lowpass_width;
-                high   += lowpass_width;
+                inverse_temporal_filter(low, high, lowpass_width);
+                low    += output_stride;
+                high   += output_stride;
             }
             if (s->progressive) {
                 low    = s->plane[plane].l_h[7];
                 high   = s->plane[plane].subband[15];
                 output = s->plane[plane].l_h[6];
-                for (i = 0; i < lowpass_width; i++) {
-                    vert_filter(output, lowpass_width, low, lowpass_width, high, highpass_stride, lowpass_height);
-                    low++;
-                    high++;
-                    output++;
-                }
+                dsp->vert_filter(output, output_stride, low, output_stride, high, highpass_stride, lowpass_width, lowpass_height);
 
                 low    = s->plane[plane].subband[14];
                 high   = s->plane[plane].subband[16];
                 output = s->plane[plane].l_h[7];
-                for (i = 0; i < lowpass_width; i++) {
-                    vert_filter(output, lowpass_width, low, highpass_stride, high, highpass_stride, lowpass_height);
-                    low++;
-                    high++;
-                    output++;
-                }
+                dsp->vert_filter(output, output_stride, low, highpass_stride, high, highpass_stride, lowpass_width, lowpass_height);
 
                 low    = s->plane[plane].l_h[9];
                 high   = s->plane[plane].subband[12];
                 output = s->plane[plane].l_h[8];
-                for (i = 0; i < lowpass_width; i++) {
-                    vert_filter(output, lowpass_width, low, lowpass_width, high, highpass_stride, lowpass_height);
-                    low++;
-                    high++;
-                    output++;
-                }
+                dsp->vert_filter(output, output_stride, low, output_stride, high, highpass_stride, lowpass_width, lowpass_height);
 
                 low    = s->plane[plane].subband[11];
                 high   = s->plane[plane].subband[13];
                 output = s->plane[plane].l_h[9];
-                for (i = 0; i < lowpass_width; i++) {
-                    vert_filter(output, lowpass_width, low, highpass_stride, high, highpass_stride, lowpass_height);
-                    low++;
-                    high++;
-                    output++;
-                }
+                dsp->vert_filter(output, output_stride, low, highpass_stride, high, highpass_stride, lowpass_width, lowpass_height);
 
                 if (s->sample_type == 1)
                     continue;
@@ -1395,12 +1211,9 @@  finish:
                 low  = s->plane[plane].l_h[6];
                 high = s->plane[plane].l_h[7];
                 for (i = 0; i < lowpass_height * 2; i++) {
-                    if (avctx->pix_fmt == AV_PIX_FMT_BAYER_RGGB16)
-                        horiz_filter_clip_bayer(dst, low, high, lowpass_width, s->bpc);
-                    else
-                        horiz_filter_clip(dst, low, high, lowpass_width, s->bpc);
-                    low  += lowpass_width;
-                    high += lowpass_width;
+                    dsp->horiz_filter_clip(dst, low, high, lowpass_width, s->bpc);
+                    low  += output_stride;
+                    high += output_stride;
                     dst  += dst_linesize;
                 }
             } else {
@@ -1408,42 +1221,22 @@  finish:
                 low    = s->plane[plane].l_h[7];
                 high   = s->plane[plane].subband[14];
                 output = s->plane[plane].l_h[6];
-                for (i = 0; i < lowpass_height; i++) {
-                    horiz_filter(output, low, high, lowpass_width);
-                    low    += lowpass_width;
-                    high   += lowpass_width;
-                    output += lowpass_width * 2;
-                }
+                dsp->horiz_filter(output, output_stride, low, output_stride, high, highpass_stride, lowpass_width, lowpass_height);
 
                 low    = s->plane[plane].subband[15];
                 high   = s->plane[plane].subband[16];
                 output = s->plane[plane].l_h[7];
-                for (i = 0; i < lowpass_height; i++) {
-                    horiz_filter(output, low, high, lowpass_width);
-                    low    += lowpass_width;
-                    high   += lowpass_width;
-                    output += lowpass_width * 2;
-                }
+                dsp->horiz_filter(output, output_stride, low, highpass_stride, high, highpass_stride, lowpass_width, lowpass_height);
 
                 low    = s->plane[plane].l_h[9];
                 high   = s->plane[plane].subband[11];
                 output = s->plane[plane].l_h[8];
-                for (i = 0; i < lowpass_height; i++) {
-                    horiz_filter(output, low, high, lowpass_width);
-                    low    += lowpass_width;
-                    high   += lowpass_width;
-                    output += lowpass_width * 2;
-                }
+                dsp->horiz_filter(output, output_stride, low, output_stride, high, highpass_stride, lowpass_width, lowpass_height);
 
                 low    = s->plane[plane].subband[12];
                 high   = s->plane[plane].subband[13];
                 output = s->plane[plane].l_h[9];
-                for (i = 0; i < lowpass_height; i++) {
-                    horiz_filter(output, low, high, lowpass_width);
-                    low    += lowpass_width;
-                    high   += lowpass_width;
-                    output += lowpass_width * 2;
-                }
+                dsp->horiz_filter(output, output_stride, low, highpass_stride, high, highpass_stride, lowpass_width, lowpass_height);
 
                 if (s->sample_type == 1)
                     continue;
@@ -1453,8 +1246,8 @@  finish:
                 high = s->plane[plane].l_h[7];
                 for (i = 0; i < lowpass_height; i++) {
                     interlaced_vertical_filter(dst, low, high, lowpass_width * 2,  pic->linesize[act_plane]/2, act_plane);
-                    low  += lowpass_width * 2;
-                    high += lowpass_width * 2;
+                    low  += output_stride * 2;
+                    high += output_stride * 2;
                     dst  += pic->linesize[act_plane];
                 }
             }
@@ -1463,7 +1256,7 @@  finish:
 
     if (s->transform_type == 2 && s->sample_type == 1) {
         int16_t *low, *high, *dst;
-        int lowpass_height, lowpass_width, highpass_stride;
+        int output_stride, lowpass_height, lowpass_width, highpass_stride;
         ptrdiff_t dst_linesize;
 
         for (plane = 0; plane < s->planes; plane++) {
@@ -1477,6 +1270,7 @@  finish:
             }
 
             lowpass_height  = s->plane[plane].band[4][1].height;
+            output_stride   = s->plane[plane].band[4][1].a_width;
             lowpass_width   = s->plane[plane].band[4][1].width;
             highpass_stride = s->plane[plane].band[4][1].stride;
 
@@ -1501,12 +1295,9 @@  finish:
                 }
 
                 for (i = 0; i < lowpass_height * 2; i++) {
-                    if (avctx->pix_fmt == AV_PIX_FMT_BAYER_RGGB16)
-                        horiz_filter_clip_bayer(dst, low, high, lowpass_width, s->bpc);
-                    else
-                        horiz_filter_clip(dst, low, high, lowpass_width, s->bpc);
-                    low  += lowpass_width;
-                    high += lowpass_width;
+                    dsp->horiz_filter_clip(dst, low, high, lowpass_width, s->bpc);
+                    low  += output_stride;
+                    high += output_stride;
                     dst  += dst_linesize;
                 }
             } else {
@@ -1515,8 +1306,8 @@  finish:
                 high = s->plane[plane].l_h[9];
                 for (i = 0; i < lowpass_height; i++) {
                     interlaced_vertical_filter(dst, low, high, lowpass_width * 2,  pic->linesize[act_plane]/2, act_plane);
-                    low  += lowpass_width * 2;
-                    high += lowpass_width * 2;
+                    low  += output_stride * 2;
+                    high += output_stride * 2;
                     dst  += pic->linesize[act_plane];
                 }
             }
diff --git a/libavcodec/cfhd.h b/libavcodec/cfhd.h
index dc329b724b..fdc6f1e546 100644
--- a/libavcodec/cfhd.h
+++ b/libavcodec/cfhd.h
@@ -29,6 +29,7 @@ 
 #include "bytestream.h"
 #include "get_bits.h"
 #include "vlc.h"
+#include "cfhddsp.h"
 
 enum CFHDParam {
     SampleType       =   1,
@@ -178,6 +179,8 @@  typedef struct CFHDContext {
     uint8_t prescale_table[8];
     Plane plane[4];
     Peak peak;
+
+    CFHDDSPContext dsp;
 } CFHDContext;
 
 int ff_cfhd_init_vlcs(CFHDContext *s);
diff --git a/libavcodec/cfhddsp.c b/libavcodec/cfhddsp.c
new file mode 100644
index 0000000000..4028263f7a
--- /dev/null
+++ b/libavcodec/cfhddsp.c
@@ -0,0 +1,118 @@ 
+/*
+ * Copyright (c) 2015-2016 Kieran Kunhya <kieran@kunhya.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/common.h"
+#include "libavutil/avassert.h"
+
+#include "cfhddsp.h"
+/* 1-D pass turning len low/high sample pairs into 2*len outputs; all strides are in int16_t elements. */
+static av_always_inline void filter(int16_t *output, ptrdiff_t out_stride,
+                          const int16_t *low, ptrdiff_t low_stride,
+                          const int16_t *high, ptrdiff_t high_stride,
+                          int len, int clip)
+{
+    int16_t tmp; /* every rounded tap sum is narrowed to 16 bits on assignment */
+    int i;
+    /* Output 0: one-sided taps (11,-4,1) over low[0..2] plus high[0]; a non-zero clip sends the result through av_clip_uintp2_c(). */
+    tmp = (11*low[0*low_stride] - 4*low[1*low_stride] + low[2*low_stride] + 4) >> 3;
+    output[(2*0+0)*out_stride] = (tmp + high[0*high_stride]) >> 1;
+    if (clip)
+        output[(2*0+0)*out_stride] = av_clip_uintp2_c(output[(2*0+0)*out_stride], clip);
+    /* Output 1: taps (5,4,-1) over the same three low samples, with high[0] subtracted. */
+    tmp = ( 5*low[0*low_stride] + 4*low[1*low_stride] - low[2*low_stride] + 4) >> 3;
+    output[(2*0+1)*out_stride] = (tmp - high[0*high_stride]) >> 1;
+    if (clip)
+        output[(2*0+1)*out_stride] = av_clip_uintp2_c(output[(2*0+1)*out_stride], clip);
+    /* Interior pairs: even = (((low[i-1] - low[i+1] + 4) >> 3) + low[i] + high[i]) >> 1; odd swaps the two neighbours and subtracts high[i]. */
+    for (i = 1; i < len - 1; i++) {
+        tmp = (low[(i-1)*low_stride] - low[(i+1)*low_stride] + 4) >> 3;
+        output[(2*i+0)*out_stride] = (tmp + low[i*low_stride] + high[i*high_stride]) >> 1;
+        if (clip)
+            output[(2*i+0)*out_stride] = av_clip_uintp2_c(output[(2*i+0)*out_stride], clip);
+
+        tmp = (low[(i+1)*low_stride] - low[(i-1)*low_stride] + 4) >> 3;
+        output[(2*i+1)*out_stride] = (tmp + low[i*low_stride] - high[i*high_stride]) >> 1;
+        if (clip)
+            output[(2*i+1)*out_stride] = av_clip_uintp2_c(output[(2*i+1)*out_stride], clip);
+    }
+    /* i is len - 1 here when len >= 2: the last pair mirrors the first, reading low[i], low[i-1] and low[i-2]. */
+    tmp = ( 5*low[i*low_stride] + 4*low[(i-1)*low_stride] - low[(i-2)*low_stride] + 4) >> 3;
+    output[(2*i+0)*out_stride] = (tmp + high[i*high_stride]) >> 1;
+    if (clip)
+        output[(2*i+0)*out_stride] = av_clip_uintp2_c(output[(2*i+0)*out_stride], clip);
+    /* Final output, at index 2*i+1. */
+    tmp = (11*low[i*low_stride] - 4*low[(i-1)*low_stride] + low[(i-2)*low_stride] + 4) >> 3;
+    output[(2*i+1)*out_stride] = (tmp - high[i*high_stride]) >> 1;
+    if (clip)
+        output[(2*i+1)*out_stride] = av_clip_uintp2_c(output[(2*i+1)*out_stride], clip);
+}
+/* Run filter() down each of the width columns, using the caller's strides and no clipping. */
+static void vert_filter(int16_t *output, ptrdiff_t out_stride,
+                        const int16_t *low, ptrdiff_t low_stride,
+                        const int16_t *high, ptrdiff_t high_stride,
+                        int width, int height)
+{
+    /* Column col starts col elements into each of the three buffers. */
+    for (int col = 0; col < width; col++) {
+        filter(output + col, out_stride,
+               low    + col, low_stride,
+               high   + col, high_stride, height, 0);
+    }
+}
+/* Run filter() along each of the height rows; output advances by 2 * ostride elements per row. */
+static void horiz_filter(int16_t *output, ptrdiff_t ostride,
+                         const int16_t *low, ptrdiff_t lstride,
+                         const int16_t *high, ptrdiff_t hstride,
+                         int width, int height)
+{
+    /* Samples inside a row are adjacent, so filter() gets unit strides and no clipping. */
+    for (int row = 0; row < height; row++) {
+        filter(output + row * ostride * 2, 1,
+               low    + row * lstride,     1,
+               high   + row * hstride,     1, width, 0);
+    }
+}
+/* Filter a single row with unit strides; clip is handed to filter() unchanged. */
+static void horiz_filter_clip(int16_t *output, const int16_t *low, const int16_t *high,
+                              int width, int clip)
+{
+    filter(output, 1, low, 1, high, 1, width, clip);
+}
+/* Single-row filter whose outputs are written two elements apart (out_stride of 2); inputs use unit strides. */
+static void horiz_filter_clip_bayer(int16_t *output, const int16_t *low, const int16_t *high,
+                                    int width, int clip)
+{
+    filter(output, 2, low, 1, high, 1, width, clip);
+}
+/* Fill c with the C filters, then give the x86 init a chance to override them when ARCH_X86 is set. */
+av_cold void ff_cfhddsp_init(CFHDDSPContext *c, int depth, int bayer)
+{
+    c->horiz_filter = horiz_filter;
+    c->vert_filter = vert_filter;
+    /* A non-zero bayer selects the clip filter that writes its outputs two elements apart. */
+    if (bayer)
+        c->horiz_filter_clip = horiz_filter_clip_bayer;
+    else
+        c->horiz_filter_clip = horiz_filter_clip;
+    /* depth is not used here; it is only forwarded to the arch-specific init. */
+    if (ARCH_X86)
+        ff_cfhddsp_init_x86(c, depth, bayer);
+}
diff --git a/libavcodec/cfhddsp.h b/libavcodec/cfhddsp.h
new file mode 100644
index 0000000000..8737eb3242
--- /dev/null
+++ b/libavcodec/cfhddsp.h
@@ -0,0 +1,44 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_CFHDDSP_H
+#define AVCODEC_CFHDDSP_H
+
+#include <stddef.h>
+#include <stdint.h>
+/* Table of the CFHD low/high band filter entry points; ff_cfhddsp_init() fills in all three. */
+typedef struct CFHDDSPContext {
+    void (*horiz_filter)(int16_t *output, ptrdiff_t out_stride,
+                         const int16_t *low, ptrdiff_t low_stride,
+                         const int16_t *high, ptrdiff_t high_stride,
+                         int width, int height);
+    /* Same parameter list as horiz_filter. */
+    void (*vert_filter)(int16_t *output, ptrdiff_t out_stride,
+                        const int16_t *low, ptrdiff_t low_stride,
+                        const int16_t *high, ptrdiff_t high_stride,
+                        int width, int height);
+    /* Takes no strides or height; the final argument is the clip bit count (bpc). */
+    void (*horiz_filter_clip)(int16_t *output, const int16_t *low, const int16_t *high,
+                              int width, int bpc);
+} CFHDDSPContext;
+/* Fill c with filter pointers: a non-zero bayer selects the Bayer clip filter; the x86 init tests depth against 10 and 12. */
+void ff_cfhddsp_init(CFHDDSPContext *c, int depth, int bayer);
+
+void ff_cfhddsp_init_x86(CFHDDSPContext *c, int depth, int bayer);
+
+#endif /* AVCODEC_CFHDDSP_H */
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 194135dafb..884dc0c759 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -50,6 +50,7 @@  OBJS-$(CONFIG_ADPCM_G722_ENCODER)      += x86/g722dsp_init.o
 OBJS-$(CONFIG_ALAC_DECODER)            += x86/alacdsp_init.o
 OBJS-$(CONFIG_APNG_DECODER)            += x86/pngdsp_init.o
 OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
+OBJS-$(CONFIG_CFHD_DECODER)            += x86/cfhddsp_init.o
 OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o x86/synth_filter_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
 OBJS-$(CONFIG_EXR_DECODER)             += x86/exrdsp_init.o
@@ -153,6 +154,7 @@  X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
 X86ASM-OBJS-$(CONFIG_ALAC_DECODER)     += x86/alacdsp.o
 X86ASM-OBJS-$(CONFIG_APNG_DECODER)     += x86/pngdsp.o
 X86ASM-OBJS-$(CONFIG_CAVS_DECODER)     += x86/cavsidct.o
+X86ASM-OBJS-$(CONFIG_CFHD_DECODER)     += x86/cfhddsp.o
 X86ASM-OBJS-$(CONFIG_DCA_DECODER)      += x86/dcadsp.o x86/synth_filter.o
 X86ASM-OBJS-$(CONFIG_DIRAC_DECODER)    += x86/diracdsp.o                \
                                           x86/dirac_dwt.o
diff --git a/libavcodec/x86/cfhddsp.asm b/libavcodec/x86/cfhddsp.asm
new file mode 100644
index 0000000000..38d5a43890
--- /dev/null
+++ b/libavcodec/x86/cfhddsp.asm
@@ -0,0 +1,619 @@ 
+;******************************************************************************
+;* x86-optimized functions for the CFHD decoder
+;* Copyright (c) 2020 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+; pmaddwd multiplier pairs (factor_*), the +4 rounding bias (pd_4) and the CLIPW bounds (pw_*).
+factor_p1_p1: dw 1,  1, 1,  1, 1,  1, 1,  1,
+factor_p1_n1: dw 1, -1, 1, -1, 1, -1, 1, -1,
+factor_n1_p1: dw -1, 1, -1, 1, -1, 1, -1, 1,
+factor_p11_n4: dw 11, -4, 11, -4, 11, -4, 11, -4,
+factor_p5_p4: dw 5, 4, 5, 4, 5, 4, 5, 4,
+pd_4: times 4 dd 4
+pw_0: times 8 dw 0
+pw_1023: times 8 dw 1023
+pw_4095: times 8 dw 4095
+
+SECTION .text
+; CFHD_HORIZ_FILTER N: N = 0 emits the strided multi-row cfhd_horiz_filter, N = 1023 / 4095 the one-row clip10 / clip12 versions.
+%macro CFHD_HORIZ_FILTER 1
+%if %1 == 1023
+cglobal cfhd_horiz_filter_clip10, 5, 6, 8, output, low, high, width, bpc
+    DEFINE_ARGS    output, low, high, width, x, temp
+    shl        widthd, 1 ; width is a byte count from here on
+%define ostrideq widthq
+%define lwidthq  widthq
+%define hwidthq  widthq
+%elif %1 == 4095
+cglobal cfhd_horiz_filter_clip12, 5, 6, 8, output, low, high, width, bpc
+    DEFINE_ARGS    output, low, high, width, x, temp
+    shl        widthd, 1 ; width is a byte count from here on
+%define ostrideq widthq
+%define lwidthq  widthq
+%define hwidthq  widthq
+%else
+%if ARCH_X86_64
+cglobal cfhd_horiz_filter, 11, 11, 8, output, ostride, low, lwidth, high, hwidth, width, height
+DEFINE_ARGS    output, ostride, low, lwidth, high, hwidth, width, height, x, y, temp
+    shl  ostrided, 1
+    shl   lwidthd, 1
+    shl   hwidthd, 1
+    shl    widthd, 1
+    ; y counts rows from -height up to 0. NOTE(review): height is an int read as a full 64-bit register; confirm its upper half is clean.
+    mov        yq, heightq
+    neg        yq
+%else
+cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, temp, width, height
+    shl        xd, 1
+    shl        yd, 1
+    shl     tempd, 1
+    shl    widthd, 1
+    ; x, y and temp hold the three strides at this point; the doubled values are written back to their argument slots
+    mov       xmp, xq
+    mov       ymp, yq
+    mov    tempmp, tempq
+    ; y becomes the negated row count, height being read from r7m
+    mov        yd, r7m
+    neg        yq
+
+%define ostrideq xm
+%define lwidthq  ym
+%define hwidthq  tempm
+%endif
+%endif
+    ; first output of the row, scalar: taps 11, -4, 1 over low[0..2], plus high[0], halved
+%if %1 == 0
+.looph:
+%endif
+    movsx          xq, word [lowq]
+    imul           xq, 11
+
+    movsx       tempq, word [lowq + 2]
+    imul        tempq, -4
+    add         tempq, xq
+
+    movsx          xq, word [lowq + 4]
+    add         tempq, xq
+    add         tempq, 4
+    sar         tempq, 3
+
+    movsx          xq, word [highq]
+    add         tempq, xq
+    sar         tempq, 1
+
+%if %1
+    movd          xm0, tempd ; the scalar is clamped through a vector register
+    CLIPW          m0, [pw_0], [pw_%1]
+    pextrw      tempd, xm0, 0
+%endif
+    mov  word [outputq], tempw
+    ; second output: taps 5, 4, -1 over the same samples, with high[0] subtracted
+    movsx          xq, word [lowq]
+    imul           xq, 5
+
+    movsx       tempq, word [lowq + 2]
+    imul        tempq, 4
+    add         tempq, xq
+
+    movsx          xq, word [lowq + 4]
+    sub         tempq, xq
+    add         tempq, 4
+    sar         tempq, 3
+
+    movsx          xq, word [highq]
+    sub         tempq, xq
+    sar         tempq, 1
+
+%if %1
+    movd          xm0, tempd
+    CLIPW          m0, [pw_0], [pw_%1]
+    pextrw      tempd, xm0, 0
+%endif
+    mov  word [outputq + 2], tempw
+
+    mov            xq, 0
+    ; vector body: x is a byte offset into low/high and advances by mmsize per pass
+.loop:
+    movu           m4, [lowq + xq] ; left neighbours
+    movu           m1, [lowq + xq + 4] ; right neighbours, two samples further on
+
+    mova           m5, m4
+    punpcklwd      m4, m1
+    punpckhwd      m5, m1
+
+    mova           m6, m4
+    mova           m7, m5
+
+    pmaddwd        m4, [factor_p1_n1] ; left - right
+    pmaddwd        m5, [factor_p1_n1]
+    pmaddwd        m6, [factor_n1_p1] ; right - left
+    pmaddwd        m7, [factor_n1_p1]
+
+    paddd          m4, [pd_4]
+    paddd          m5, [pd_4]
+    paddd          m6, [pd_4]
+    paddd          m7, [pd_4]
+
+    psrad          m4, 3
+    psrad          m5, 3
+    psrad          m6, 3
+    psrad          m7, 3
+
+    movu           m2, [lowq + xq + 2] ; centre samples
+    movu           m3, [highq + xq + 2]
+
+    mova           m0, m2
+    punpcklwd      m2, m3
+    punpckhwd      m0, m3
+
+    mova           m1, m2
+    mova           m3, m0
+
+    pmaddwd        m2, [factor_p1_p1] ; low + high
+    pmaddwd        m0, [factor_p1_p1]
+    pmaddwd        m1, [factor_p1_n1] ; low - high
+    pmaddwd        m3, [factor_p1_n1]
+
+    paddd          m2, m4
+    paddd          m0, m5
+    paddd          m1, m6
+    paddd          m3, m7
+
+    psrad          m2, 1
+    psrad          m0, 1
+    psrad          m1, 1
+    psrad          m3, 1
+
+    packssdw       m2, m0
+    packssdw       m1, m3
+
+    mova           m0, m2
+    punpcklwd      m2, m1 ; interleave the sum and difference results
+    punpckhwd      m0, m1
+
+%if %1
+    CLIPW          m2, [pw_0], [pw_%1]
+    CLIPW          m0, [pw_0], [pw_%1]
+%endif
+    ; two outputs per centre sample, so the store offset is 2*x + 4 bytes
+    movu  [outputq + xq * 2 + 4], m2
+    movu  [outputq + xq * 2 + mmsize + 4], m0
+
+    add            xq, mmsize
+    cmp            xq, widthq
+    jl .loop
+    ; NOTE(review): the last vector pass touches memory past the row end; presumably the buffers are padded - confirm.
+    add          lowq, widthq
+    add         highq, widthq
+    add       outputq, widthq
+    add       outputq, widthq
+    ; pointers now sit at the row end: the last pair uses taps 5, 4, -1 on the final three low samples, plus high
+    movsx          xq, word [lowq - 2]
+    imul           xq, 5
+
+    movsx       tempq, word [lowq - 4]
+    imul        tempq, 4
+    add         tempq, xq
+
+    movsx          xq, word [lowq - 6]
+    sub         tempq, xq
+    add         tempq, 4
+    sar         tempq, 3
+
+    movsx          xq, word [highq - 2]
+    add         tempq, xq
+    sar         tempq, 1
+
+%if %1
+    movd          xm0, tempd
+    CLIPW          m0, [pw_0], [pw_%1]
+    pextrw      tempd, xm0, 0
+%endif
+    mov  word [outputq - 4], tempw
+    ; final output: taps 11, -4, 1, with high subtracted
+    movsx          xq, word [lowq - 2]
+    imul           xq, 11
+
+    movsx       tempq, word [lowq - 4]
+    imul        tempq, -4
+    add         tempq, xq
+
+    movsx          xq, word [lowq - 6]
+    add         tempq, xq
+    add         tempq, 4
+    sar         tempq, 3
+
+    movsx          xq, word [highq - 2]
+    sub         tempq, xq
+    sar         tempq, 1
+
+%if %1
+    movd          xm0, tempd
+    CLIPW          m0, [pw_0], [pw_%1]
+    pextrw      tempd, xm0, 0
+%endif
+    mov  word [outputq - 2], tempw
+
+%if %1 == 0
+    sub          lowq, widthq
+    sub         highq, widthq
+    sub       outputq, widthq
+    sub       outputq, widthq
+    ; back at the row start: step each pointer by its stride, the output pointer by ostride twice
+    add          lowq, lwidthq
+    add         highq, hwidthq
+    add       outputq, ostrideq
+    add       outputq, ostrideq
+    add            yq, 1
+    jl .looph
+%endif
+
+    RET
+%endmacro
+; 0 builds cfhd_horiz_filter; 1023 and 4095 build the clip10 and clip12 single-row variants.
+INIT_XMM sse2
+CFHD_HORIZ_FILTER 0
+
+INIT_XMM sse2
+CFHD_HORIZ_FILTER 1023
+
+INIT_XMM sse2
+CFHD_HORIZ_FILTER 4095
+; cfhd_vert_filter: the same filter taps applied down columns, one vector of adjacent columns per .loopw pass.
+INIT_XMM sse2
+%if ARCH_X86_64
+cglobal cfhd_vert_filter, 11, 11, 8, output, ostride, low, lwidth, high, hwidth, width, height
+DEFINE_ARGS    output, ostride, low, lwidth, high, hwidth, width, height, x, y, pos
+    shl        ostrided, 1
+    shl         lwidthd, 1
+    shl         hwidthd, 1
+    shl          widthd, 1
+    ; height - 1 bounds the interior loop. NOTE(review): height is an int used as a 64-bit register; confirm its upper half is clean.
+    dec   heightq
+%else
+cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height
+    shl        xd, 1
+    shl        yd, 1
+    shl      posd, 1
+    shl    widthd, 1
+
+    mov       xmp, xq
+    mov       ymp, yq
+    mov     posmp, posq
+    ; height - 1 is stored in the width argument slot and read back through heightq
+    mov        xq, r7m
+    dec        xq
+    mov   widthmp, xq
+
+%define ostrideq xm
+%define lwidthq  ym
+%define hwidthq  posm
+%define heightq  widthm
+
+%endif
+    ; x is the byte offset of the current group of columns
+    xor        xq, xq
+.loopw:
+    xor        yq, yq
+    ; output row 0: 11*low[0] - 4*low[1] via pmaddwd, plus low[2], rounded, plus high[0], halved
+    mov      posq, xq
+    movu       m0, [lowq + posq]
+    add      posq, lwidthq
+    movu       m1, [lowq + posq]
+    mova       m2, m0
+    punpcklwd  m0, m1
+    punpckhwd  m2, m1
+
+    pmaddwd    m0, [factor_p11_n4]
+    pmaddwd    m2, [factor_p11_n4]
+    ; interleaving against zero and then psrad 16 sign-extends the words of the third row to dwords
+    pxor       m4, m4
+    add      posq, lwidthq
+    movu       m1, [lowq + posq]
+    mova       m3, m4
+    punpcklwd  m4, m1
+    punpckhwd  m3, m1
+
+    psrad      m4, 16
+    psrad      m3, 16
+
+    paddd      m0, m4
+    paddd      m2, m3
+
+    paddd      m0, [pd_4]
+    paddd      m2, [pd_4]
+
+    psrad      m0, 3
+    psrad      m2, 3
+
+    mov      posq, xq
+    pxor       m4, m4
+    movu       m1, [highq + posq]
+    mova       m3, m4
+    punpcklwd  m4, m1
+    punpckhwd  m3, m1
+
+    psrad      m4, 16
+    psrad      m3, 16
+
+    paddd      m0, m4
+    paddd      m2, m3
+
+    psrad      m0, 1
+    psrad      m2, 1
+
+    packssdw   m0, m2
+
+    movu    [outputq + posq], m0
+    ; output row 1: 5*low[0] + 4*low[1] - low[2], rounded, minus high[0], halved
+    movu       m0, [lowq + posq]
+    add      posq, lwidthq
+    movu       m1, [lowq + posq]
+    mova       m2, m0
+    punpcklwd  m0, m1
+    punpckhwd  m2, m1
+
+    pmaddwd    m0, [factor_p5_p4]
+    pmaddwd    m2, [factor_p5_p4]
+
+    pxor       m4, m4
+    add      posq, lwidthq
+    movu       m1, [lowq + posq]
+    mova       m3, m4
+    punpcklwd  m4, m1
+    punpckhwd  m3, m1
+
+    psrad      m4, 16
+    psrad      m3, 16
+
+    psubd      m0, m4
+    psubd      m2, m3
+
+    paddd      m0, [pd_4]
+    paddd      m2, [pd_4]
+
+    psrad      m0, 3
+    psrad      m2, 3
+
+    mov      posq, xq
+    pxor       m4, m4
+    movu       m1, [highq + posq]
+    mova       m3, m4
+    punpcklwd  m4, m1
+    punpckhwd  m3, m1
+
+    psrad      m4, 16
+    psrad      m3, 16
+
+    psubd      m0, m4
+    psubd      m2, m3
+
+    psrad      m0, 1
+    psrad      m2, 1
+
+    packssdw   m0, m2
+
+    add      posq, ostrideq
+    movu    [outputq + posq], m0
+
+    add        yq, 1 ; interior rows start at y = 1
+.looph:
+    mov      posq, lwidthq
+    imul     posq, yq
+    sub      posq, lwidthq
+    add      posq, xq
+    ; posq addresses low row y-1; two strides further on is row y+1
+    movu       m4, [lowq + posq]
+
+    add      posq, lwidthq
+    add      posq, lwidthq
+    movu       m1, [lowq + posq]
+
+    mova       m5, m4
+    punpcklwd  m4, m1
+    punpckhwd  m5, m1
+
+    mova       m6, m4
+    mova       m7, m5
+
+    pmaddwd    m4, [factor_p1_n1]
+    pmaddwd    m5, [factor_p1_n1]
+    pmaddwd    m6, [factor_n1_p1]
+    pmaddwd    m7, [factor_n1_p1]
+
+    paddd      m4, [pd_4]
+    paddd      m5, [pd_4]
+    paddd      m6, [pd_4]
+    paddd      m7, [pd_4]
+
+    psrad      m4, 3
+    psrad      m5, 3
+    psrad      m6, 3
+    psrad      m7, 3
+    ; centre row y of low, paired with row y of high
+    sub      posq, lwidthq
+    movu       m0, [lowq + posq]
+
+    mov      posq, hwidthq
+    imul     posq, yq
+    add      posq, xq
+    movu       m1, [highq + posq]
+
+    mova       m2, m0
+    punpcklwd  m0, m1
+    punpckhwd  m2, m1
+
+    mova       m1, m0
+    mova       m3, m2
+
+    pmaddwd    m0, [factor_p1_p1]
+    pmaddwd    m2, [factor_p1_p1]
+    pmaddwd    m1, [factor_p1_n1]
+    pmaddwd    m3, [factor_p1_n1]
+
+    paddd      m0, m4
+    paddd      m2, m5
+    paddd      m1, m6
+    paddd      m3, m7
+
+    psrad      m0, 1
+    psrad      m2, 1
+    psrad      m1, 1
+    psrad      m3, 1
+
+    packssdw   m0, m2
+    packssdw   m1, m3
+    ; the two results go to output rows 2*y and 2*y+1
+    mov      posq, ostrideq
+    imul     posq, 2
+    imul     posq, yq
+    add      posq, xq
+
+    movu    [outputq + posq], m0
+    add      posq, ostrideq
+    movu    [outputq + posq], m1
+
+    add        yq, 1
+    cmp        yq, heightq
+    jl .looph
+    ; y is height - 1 here for height >= 3: output row 2*y from 5*low[y] + 4*low[y-1] - low[y-2], plus high[y]
+    mov      posq, lwidthq
+    imul     posq, yq
+    add      posq, xq
+    movu       m0, [lowq + posq]
+    sub      posq, lwidthq
+    movu       m1, [lowq + posq]
+    mova       m2, m0
+    punpcklwd  m0, m1
+    punpckhwd  m2, m1
+
+    pmaddwd    m0, [factor_p5_p4]
+    pmaddwd    m2, [factor_p5_p4]
+
+    pxor       m4, m4
+    sub      posq, lwidthq
+    movu       m1, [lowq + posq]
+    mova       m3, m4
+    punpcklwd  m4, m1
+    punpckhwd  m3, m1
+
+    psrad      m4, 16
+    psrad      m3, 16
+
+    psubd      m0, m4
+    psubd      m2, m3
+
+    paddd      m0, [pd_4]
+    paddd      m2, [pd_4]
+
+    psrad      m0, 3
+    psrad      m2, 3
+
+    mov      posq, hwidthq
+    imul     posq, yq
+    add      posq, xq
+    pxor       m4, m4
+    movu       m1, [highq + posq]
+    mova       m3, m4
+    punpcklwd  m4, m1
+    punpckhwd  m3, m1
+
+    psrad      m4, 16
+    psrad      m3, 16
+
+    paddd      m0, m4
+    paddd      m2, m3
+
+    psrad      m0, 1
+    psrad      m2, 1
+
+    packssdw   m0, m2
+
+    mov      posq, ostrideq
+    imul     posq, 2
+    imul     posq, yq
+    add      posq, xq
+    movu    [outputq + posq], m0
+    ; output row 2*y+1 from 11*low[y] - 4*low[y-1] + low[y-2], minus high[y]
+    mov      posq, lwidthq
+    imul     posq, yq
+    add      posq, xq
+    movu       m0, [lowq + posq]
+    sub      posq, lwidthq
+    movu       m1, [lowq + posq]
+    mova       m2, m0
+    punpcklwd  m0, m1
+    punpckhwd  m2, m1
+
+    pmaddwd    m0, [factor_p11_n4]
+    pmaddwd    m2, [factor_p11_n4]
+
+    pxor       m4, m4
+    sub      posq, lwidthq
+    movu       m1, [lowq + posq]
+    mova       m3, m4
+    punpcklwd  m4, m1
+    punpckhwd  m3, m1
+
+    psrad      m4, 16
+    psrad      m3, 16
+
+    paddd      m0, m4
+    paddd      m2, m3
+
+    paddd      m0, [pd_4]
+    paddd      m2, [pd_4]
+
+    psrad      m0, 3
+    psrad      m2, 3
+
+    mov      posq, hwidthq
+    imul     posq, yq
+    add      posq, xq
+    pxor       m4, m4
+    movu       m1, [highq + posq]
+    mova       m3, m4
+    punpcklwd  m4, m1
+    punpckhwd  m3, m1
+
+    psrad      m4, 16
+    psrad      m3, 16
+
+    psubd      m0, m4
+    psubd      m2, m3
+
+    psrad      m0, 1
+    psrad      m2, 1
+
+    packssdw   m0, m2
+
+    mov      posq, ostrideq
+    imul     posq, 2
+    imul     posq, yq
+    add      posq, ostrideq
+    add      posq, xq
+    movu    [outputq + posq], m0
+    ; next group of columns; x and width are both byte counts
+    add        xq, mmsize
+    cmp        xq, widthq
+    jl .loopw
+    RET
diff --git a/libavcodec/x86/cfhddsp_init.c b/libavcodec/x86/cfhddsp_init.c
new file mode 100644
index 0000000000..ab7ff83087
--- /dev/null
+++ b/libavcodec/x86/cfhddsp_init.c
@@ -0,0 +1,52 @@ 
+/*
+ * Copyright (c) 2020 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/cfhddsp.h"
+/* SSE2 routines implemented in x86/cfhddsp.asm. */
+void ff_cfhd_horiz_filter_sse2(int16_t *output, ptrdiff_t out_stride,
+                               const int16_t *low, ptrdiff_t low_stride,
+                               const int16_t *high, ptrdiff_t high_stride,
+                               int width, int height);
+void ff_cfhd_vert_filter_sse2(int16_t *output, ptrdiff_t out_stride,
+                              const int16_t *low, ptrdiff_t low_stride,
+                              const int16_t *high, ptrdiff_t high_stride,
+                              int width, int height);
+void ff_cfhd_horiz_filter_clip10_sse2(int16_t *output, const int16_t *low, const int16_t *high, int width, int bpc);
+void ff_cfhd_horiz_filter_clip12_sse2(int16_t *output, const int16_t *low, const int16_t *high, int width, int bpc);
+/* Replace the filters already stored in c with the SSE2 versions when EXTERNAL_SSE2() accepts the CPU flags. */
+av_cold void ff_cfhddsp_init_x86(CFHDDSPContext *c, int depth, int bayer)
+{
+    int cpu_flags = av_get_cpu_flags();
+    /* horiz_filter_clip is only replaced for non-Bayer depths 10 and 12; in every other case it is left untouched. */
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->horiz_filter = ff_cfhd_horiz_filter_sse2;
+        c->vert_filter = ff_cfhd_vert_filter_sse2;
+        if (depth == 10 && !bayer)
+            c->horiz_filter_clip = ff_cfhd_horiz_filter_clip10_sse2;
+        if (depth == 12 && !bayer)
+            c->horiz_filter_clip = ff_cfhd_horiz_filter_clip12_sse2;
+    }
+}