diff mbox

[FFmpeg-devel,V2] lavfi/hqdn3d: add slice thread optimization

Message ID 1570542340-9853-1-git-send-email-mypopydev@gmail.com
State Superseded
Headers show

Commit Message

Jun Zhao Oct. 8, 2019, 1:45 p.m. UTC
From: Jun Zhao <barryjzhao@tencent.com>

Enable one thread per plane; the following test command was used for 1080P video
(YUV420P format):

ffmpeg -i 1080p.mp4 -an -vf hqdn3d -f null /dev/null

This optimization improves performance by about 30% in the 1080P YUV420P
case (from 110fps to 143fps), and also passes the framemd5 check and FATE.

Signed-off-by: Jun Zhao <barryjzhao@tencent.com>
---
 libavfilter/vf_hqdn3d.c |   56 +++++++++++++++++++++++++++++++++-------------
 libavfilter/vf_hqdn3d.h |    2 +-
 2 files changed, 41 insertions(+), 17 deletions(-)

Comments

Paul B Mahol Oct. 8, 2019, 2:57 p.m. UTC | #1
On 10/8/19, Jun Zhao <mypopydev@gmail.com> wrote:
> From: Jun Zhao <barryjzhao@tencent.com>
>
> Enabled one thread per planar, used the test command for 1080P video
> (YUV420P format) as follow:
>
> ffmpeg -i 1080p.mp4 -an -vf hqdn3d -f null /dev/nul
>
> This optimization improved the performance about 30% in 1080P YUV420P
> case (from 110fps to 143fps), also pass the framemd5 check and FATE.
>
> Signed-off-by: Jun Zhao <barryjzhao@tencent.com>
> ---
>  libavfilter/vf_hqdn3d.c |   56
> +++++++++++++++++++++++++++++++++-------------
>  libavfilter/vf_hqdn3d.h |    2 +-
>  2 files changed, 41 insertions(+), 17 deletions(-)
>
> diff --git a/libavfilter/vf_hqdn3d.c b/libavfilter/vf_hqdn3d.c
> index d6c14bb..6dd8fd8 100644
> --- a/libavfilter/vf_hqdn3d.c
> +++ b/libavfilter/vf_hqdn3d.c
> @@ -223,7 +223,9 @@ static av_cold void uninit(AVFilterContext *ctx)
>      av_freep(&s->coefs[1]);
>      av_freep(&s->coefs[2]);
>      av_freep(&s->coefs[3]);
> -    av_freep(&s->line);
> +    av_freep(&s->line[0]);
> +    av_freep(&s->line[1]);
> +    av_freep(&s->line[2]);
>      av_freep(&s->frame_prev[0]);
>      av_freep(&s->frame_prev[1]);
>      av_freep(&s->frame_prev[2]);
> @@ -271,9 +273,11 @@ static int config_input(AVFilterLink *inlink)
>      s->vsub  = desc->log2_chroma_h;
>      s->depth = desc->comp[0].depth;
>
> -    s->line = av_malloc_array(inlink->w, sizeof(*s->line));
> -    if (!s->line)
> -        return AVERROR(ENOMEM);
> +    for (i = 0; i < 3; i++) {
> +        s->line[i] = av_malloc_array(inlink->w, sizeof(*s->line[i]));
> +        if (!s->line[i])
> +            return AVERROR(ENOMEM);
> +    }
>
>      for (i = 0; i < 4; i++) {
>          s->coefs[i] = precalc_coefs(s->strength[i], s->depth);
> @@ -287,14 +291,38 @@ static int config_input(AVFilterLink *inlink)
>      return 0;
>  }
>
> +struct ThreadData {
> +    AVFrame *in, *out;
> +    int direct;
> +};
> +
> +static int do_denoise(AVFilterContext *ctx, void *data, int job_nr, int
> n_jobs)
> +{
> +    HQDN3DContext *s = ctx->priv;
> +    struct ThreadData *td = data;
> +    AVFrame *out = td->out;
> +    AVFrame *in = td->in;
> +    int direct = td->direct;

looks unused.

> +
> +    denoise(s, in->data[job_nr], out->data[job_nr],
> +                s->line[job_nr], &s->frame_prev[job_nr],
> +                AV_CEIL_RSHIFT(in->width,  (!!job_nr * s->hsub)),
> +                AV_CEIL_RSHIFT(in->height, (!!job_nr * s->vsub)),
> +                in->linesize[job_nr], out->linesize[job_nr],
> +                s->coefs[job_nr ? CHROMA_SPATIAL : LUMA_SPATIAL],
> +                s->coefs[job_nr ? CHROMA_TMP     : LUMA_TMP]);
> +
> +    return 0;
> +}
> +
>  static int filter_frame(AVFilterLink *inlink, AVFrame *in)
>  {
>      AVFilterContext *ctx  = inlink->dst;
> -    HQDN3DContext *s = ctx->priv;
>      AVFilterLink *outlink = ctx->outputs[0];
>
>      AVFrame *out;
> -    int c, direct = av_frame_is_writable(in) && !ctx->is_disabled;
> +    int direct = av_frame_is_writable(in) && !ctx->is_disabled;
> +    struct ThreadData td;
>
>      if (direct) {
>          out = in;
> @@ -308,15 +336,11 @@ static int filter_frame(AVFilterLink *inlink, AVFrame
> *in)
>          av_frame_copy_props(out, in);
>      }
>
> -    for (c = 0; c < 3; c++) {
> -        denoise(s, in->data[c], out->data[c],
> -                s->line, &s->frame_prev[c],
> -                AV_CEIL_RSHIFT(in->width,  (!!c * s->hsub)),
> -                AV_CEIL_RSHIFT(in->height, (!!c * s->vsub)),
> -                in->linesize[c], out->linesize[c],
> -                s->coefs[c ? CHROMA_SPATIAL : LUMA_SPATIAL],
> -                s->coefs[c ? CHROMA_TMP     : LUMA_TMP]);
> -    }
> +    td.in = in;
> +    td.out = out;
> +    td.direct = direct;
> +    /* one thread per planar */
> +    ctx->internal->execute(ctx, do_denoise, &td, NULL, 3);
>
>      if (ctx->is_disabled) {
>          av_frame_free(&out);
> @@ -370,5 +394,5 @@ AVFilter ff_vf_hqdn3d = {
>      .query_formats = query_formats,
>      .inputs        = avfilter_vf_hqdn3d_inputs,
>      .outputs       = avfilter_vf_hqdn3d_outputs,
> -    .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL,
> +    .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL |
> AVFILTER_FLAG_SLICE_THREADS,
>  };
> diff --git a/libavfilter/vf_hqdn3d.h b/libavfilter/vf_hqdn3d.h
> index 03a79a1..3279bbc 100644
> --- a/libavfilter/vf_hqdn3d.h
> +++ b/libavfilter/vf_hqdn3d.h
> @@ -31,7 +31,7 @@
>  typedef struct HQDN3DContext {
>      const AVClass *class;
>      int16_t *coefs[4];
> -    uint16_t *line;
> +    uint16_t *line[3];
>      uint16_t *frame_prev[3];
>      double strength[4];
>      int hsub, vsub;
> --
> 1.7.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
mypopy@gmail.com Oct. 9, 2019, 1:03 a.m. UTC | #2
On Tue, Oct 8, 2019 at 10:57 PM Paul B Mahol <onemda@gmail.com> wrote:
>
> On 10/8/19, Jun Zhao <mypopydev@gmail.com> wrote:
> > From: Jun Zhao <barryjzhao@tencent.com>
> >
> > Enabled one thread per planar, used the test command for 1080P video
> > (YUV420P format) as follow:
> >
> > ffmpeg -i 1080p.mp4 -an -vf hqdn3d -f null /dev/nul
> >
> > This optimization improved the performance about 30% in 1080P YUV420P
> > case (from 110fps to 143fps), also pass the framemd5 check and FATE.
> >
> > Signed-off-by: Jun Zhao <barryjzhao@tencent.com>
> > ---
> >  libavfilter/vf_hqdn3d.c |   56
> > +++++++++++++++++++++++++++++++++-------------
> >  libavfilter/vf_hqdn3d.h |    2 +-
> >  2 files changed, 41 insertions(+), 17 deletions(-)
> >
> > diff --git a/libavfilter/vf_hqdn3d.c b/libavfilter/vf_hqdn3d.c
> > index d6c14bb..6dd8fd8 100644
> > --- a/libavfilter/vf_hqdn3d.c
> > +++ b/libavfilter/vf_hqdn3d.c
> > @@ -223,7 +223,9 @@ static av_cold void uninit(AVFilterContext *ctx)
> >      av_freep(&s->coefs[1]);
> >      av_freep(&s->coefs[2]);
> >      av_freep(&s->coefs[3]);
> > -    av_freep(&s->line);
> > +    av_freep(&s->line[0]);
> > +    av_freep(&s->line[1]);
> > +    av_freep(&s->line[2]);
> >      av_freep(&s->frame_prev[0]);
> >      av_freep(&s->frame_prev[1]);
> >      av_freep(&s->frame_prev[2]);
> > @@ -271,9 +273,11 @@ static int config_input(AVFilterLink *inlink)
> >      s->vsub  = desc->log2_chroma_h;
> >      s->depth = desc->comp[0].depth;
> >
> > -    s->line = av_malloc_array(inlink->w, sizeof(*s->line));
> > -    if (!s->line)
> > -        return AVERROR(ENOMEM);
> > +    for (i = 0; i < 3; i++) {
> > +        s->line[i] = av_malloc_array(inlink->w, sizeof(*s->line[i]));
> > +        if (!s->line[i])
> > +            return AVERROR(ENOMEM);
> > +    }
> >
> >      for (i = 0; i < 4; i++) {
> >          s->coefs[i] = precalc_coefs(s->strength[i], s->depth);
> > @@ -287,14 +291,38 @@ static int config_input(AVFilterLink *inlink)
> >      return 0;
> >  }
> >
> > +struct ThreadData {
> > +    AVFrame *in, *out;
> > +    int direct;
> > +};
> > +
> > +static int do_denoise(AVFilterContext *ctx, void *data, int job_nr, int
> > n_jobs)
> > +{
> > +    HQDN3DContext *s = ctx->priv;
> > +    struct ThreadData *td = data;
> > +    AVFrame *out = td->out;
> > +    AVFrame *in = td->in;
> > +    int direct = td->direct;
>
> looks unused.
denoise is not a function; it is in fact a macro, and the macro needs
the `direct` variable :)
>
> > +
> > +    denoise(s, in->data[job_nr], out->data[job_nr],
> > +                s->line[job_nr], &s->frame_prev[job_nr],
> > +                AV_CEIL_RSHIFT(in->width,  (!!job_nr * s->hsub)),
> > +                AV_CEIL_RSHIFT(in->height, (!!job_nr * s->vsub)),
> > +                in->linesize[job_nr], out->linesize[job_nr],
> > +                s->coefs[job_nr ? CHROMA_SPATIAL : LUMA_SPATIAL],
> > +                s->coefs[job_nr ? CHROMA_TMP     : LUMA_TMP]);
> > +
> > +    return 0;
> > +}
> > +
> >  static int filter_frame(AVFilterLink *inlink, AVFrame *in)
> >  {
> >      AVFilterContext *ctx  = inlink->dst;
> > -    HQDN3DContext *s = ctx->priv;
> >      AVFilterLink *outlink = ctx->outputs[0];
> >
> >      AVFrame *out;
> > -    int c, direct = av_frame_is_writable(in) && !ctx->is_disabled;
> > +    int direct = av_frame_is_writable(in) && !ctx->is_disabled;
> > +    struct ThreadData td;
> >
> >      if (direct) {
> >          out = in;
> > @@ -308,15 +336,11 @@ static int filter_frame(AVFilterLink *inlink, AVFrame
> > *in)
> >          av_frame_copy_props(out, in);
> >      }
> >
> > -    for (c = 0; c < 3; c++) {
> > -        denoise(s, in->data[c], out->data[c],
> > -                s->line, &s->frame_prev[c],
> > -                AV_CEIL_RSHIFT(in->width,  (!!c * s->hsub)),
> > -                AV_CEIL_RSHIFT(in->height, (!!c * s->vsub)),
> > -                in->linesize[c], out->linesize[c],
> > -                s->coefs[c ? CHROMA_SPATIAL : LUMA_SPATIAL],
> > -                s->coefs[c ? CHROMA_TMP     : LUMA_TMP]);
> > -    }
> > +    td.in = in;
> > +    td.out = out;
> > +    td.direct = direct;
> > +    /* one thread per planar */
> > +    ctx->internal->execute(ctx, do_denoise, &td, NULL, 3);
> >
> >      if (ctx->is_disabled) {
> >          av_frame_free(&out);
> > @@ -370,5 +394,5 @@ AVFilter ff_vf_hqdn3d = {
> >      .query_formats = query_formats,
> >      .inputs        = avfilter_vf_hqdn3d_inputs,
> >      .outputs       = avfilter_vf_hqdn3d_outputs,
> > -    .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL,
> > +    .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL |
> > AVFILTER_FLAG_SLICE_THREADS,
> >  };
> > diff --git a/libavfilter/vf_hqdn3d.h b/libavfilter/vf_hqdn3d.h
> > index 03a79a1..3279bbc 100644
> > --- a/libavfilter/vf_hqdn3d.h
> > +++ b/libavfilter/vf_hqdn3d.h
> > @@ -31,7 +31,7 @@
> >  typedef struct HQDN3DContext {
> >      const AVClass *class;
> >      int16_t *coefs[4];
> > -    uint16_t *line;
> > +    uint16_t *line[3];
> >      uint16_t *frame_prev[3];
> >      double strength[4];
> >      int hsub, vsub;
> > --
> > 1.7.1
> >
Paul B Mahol Oct. 9, 2019, 7:21 a.m. UTC | #3
On 10/8/19, Jun Zhao <mypopydev@gmail.com> wrote:
> From: Jun Zhao <barryjzhao@tencent.com>
>
> Enabled one thread per planar, used the test command for 1080P video
> (YUV420P format) as follow:
>
> ffmpeg -i 1080p.mp4 -an -vf hqdn3d -f null /dev/nul
>
> This optimization improved the performance about 30% in 1080P YUV420P
> case (from 110fps to 143fps), also pass the framemd5 check and FATE.
>
> Signed-off-by: Jun Zhao <barryjzhao@tencent.com>
> ---
>  libavfilter/vf_hqdn3d.c |   56
> +++++++++++++++++++++++++++++++++-------------
>  libavfilter/vf_hqdn3d.h |    2 +-
>  2 files changed, 41 insertions(+), 17 deletions(-)
>
> diff --git a/libavfilter/vf_hqdn3d.c b/libavfilter/vf_hqdn3d.c
> index d6c14bb..6dd8fd8 100644
> --- a/libavfilter/vf_hqdn3d.c
> +++ b/libavfilter/vf_hqdn3d.c
> @@ -223,7 +223,9 @@ static av_cold void uninit(AVFilterContext *ctx)
>      av_freep(&s->coefs[1]);
>      av_freep(&s->coefs[2]);
>      av_freep(&s->coefs[3]);
> -    av_freep(&s->line);
> +    av_freep(&s->line[0]);
> +    av_freep(&s->line[1]);
> +    av_freep(&s->line[2]);
>      av_freep(&s->frame_prev[0]);
>      av_freep(&s->frame_prev[1]);
>      av_freep(&s->frame_prev[2]);
> @@ -271,9 +273,11 @@ static int config_input(AVFilterLink *inlink)
>      s->vsub  = desc->log2_chroma_h;
>      s->depth = desc->comp[0].depth;
>
> -    s->line = av_malloc_array(inlink->w, sizeof(*s->line));
> -    if (!s->line)
> -        return AVERROR(ENOMEM);
> +    for (i = 0; i < 3; i++) {
> +        s->line[i] = av_malloc_array(inlink->w, sizeof(*s->line[i]));
> +        if (!s->line[i])
> +            return AVERROR(ENOMEM);
> +    }
>
>      for (i = 0; i < 4; i++) {
>          s->coefs[i] = precalc_coefs(s->strength[i], s->depth);
> @@ -287,14 +291,38 @@ static int config_input(AVFilterLink *inlink)
>      return 0;
>  }
>
> +struct ThreadData {
> +    AVFrame *in, *out;
> +    int direct;
> +};

Please properly typedef this struct, as all other filters do.

> +
> +static int do_denoise(AVFilterContext *ctx, void *data, int job_nr, int
> n_jobs)
> +{
> +    HQDN3DContext *s = ctx->priv;
> +    struct ThreadData *td = data;
> +    AVFrame *out = td->out;
> +    AVFrame *in = td->in;
> +    int direct = td->direct;
> +
> +    denoise(s, in->data[job_nr], out->data[job_nr],
> +                s->line[job_nr], &s->frame_prev[job_nr],
> +                AV_CEIL_RSHIFT(in->width,  (!!job_nr * s->hsub)),
> +                AV_CEIL_RSHIFT(in->height, (!!job_nr * s->vsub)),
> +                in->linesize[job_nr], out->linesize[job_nr],
> +                s->coefs[job_nr ? CHROMA_SPATIAL : LUMA_SPATIAL],
> +                s->coefs[job_nr ? CHROMA_TMP     : LUMA_TMP]);
> +
> +    return 0;
> +}
> +
>  static int filter_frame(AVFilterLink *inlink, AVFrame *in)
>  {
>      AVFilterContext *ctx  = inlink->dst;
> -    HQDN3DContext *s = ctx->priv;
>      AVFilterLink *outlink = ctx->outputs[0];
>
>      AVFrame *out;
> -    int c, direct = av_frame_is_writable(in) && !ctx->is_disabled;
> +    int direct = av_frame_is_writable(in) && !ctx->is_disabled;
> +    struct ThreadData td;
>
>      if (direct) {
>          out = in;
> @@ -308,15 +336,11 @@ static int filter_frame(AVFilterLink *inlink, AVFrame
> *in)
>          av_frame_copy_props(out, in);
>      }
>
> -    for (c = 0; c < 3; c++) {
> -        denoise(s, in->data[c], out->data[c],
> -                s->line, &s->frame_prev[c],
> -                AV_CEIL_RSHIFT(in->width,  (!!c * s->hsub)),
> -                AV_CEIL_RSHIFT(in->height, (!!c * s->vsub)),
> -                in->linesize[c], out->linesize[c],
> -                s->coefs[c ? CHROMA_SPATIAL : LUMA_SPATIAL],
> -                s->coefs[c ? CHROMA_TMP     : LUMA_TMP]);
> -    }
> +    td.in = in;
> +    td.out = out;
> +    td.direct = direct;
> +    /* one thread per planar */
> +    ctx->internal->execute(ctx, do_denoise, &td, NULL, 3);
>
>      if (ctx->is_disabled) {
>          av_frame_free(&out);
> @@ -370,5 +394,5 @@ AVFilter ff_vf_hqdn3d = {
>      .query_formats = query_formats,
>      .inputs        = avfilter_vf_hqdn3d_inputs,
>      .outputs       = avfilter_vf_hqdn3d_outputs,
> -    .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL,
> +    .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL |
> AVFILTER_FLAG_SLICE_THREADS,
>  };
> diff --git a/libavfilter/vf_hqdn3d.h b/libavfilter/vf_hqdn3d.h
> index 03a79a1..3279bbc 100644
> --- a/libavfilter/vf_hqdn3d.h
> +++ b/libavfilter/vf_hqdn3d.h
> @@ -31,7 +31,7 @@
>  typedef struct HQDN3DContext {
>      const AVClass *class;
>      int16_t *coefs[4];
> -    uint16_t *line;
> +    uint16_t *line[3];
>      uint16_t *frame_prev[3];
>      double strength[4];
>      int hsub, vsub;
> --
> 1.7.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
diff mbox

Patch

diff --git a/libavfilter/vf_hqdn3d.c b/libavfilter/vf_hqdn3d.c
index d6c14bb..6dd8fd8 100644
--- a/libavfilter/vf_hqdn3d.c
+++ b/libavfilter/vf_hqdn3d.c
@@ -223,7 +223,9 @@  static av_cold void uninit(AVFilterContext *ctx)
     av_freep(&s->coefs[1]);
     av_freep(&s->coefs[2]);
     av_freep(&s->coefs[3]);
-    av_freep(&s->line);
+    av_freep(&s->line[0]);
+    av_freep(&s->line[1]);
+    av_freep(&s->line[2]);
     av_freep(&s->frame_prev[0]);
     av_freep(&s->frame_prev[1]);
     av_freep(&s->frame_prev[2]);
@@ -271,9 +273,11 @@  static int config_input(AVFilterLink *inlink)
     s->vsub  = desc->log2_chroma_h;
     s->depth = desc->comp[0].depth;
 
-    s->line = av_malloc_array(inlink->w, sizeof(*s->line));
-    if (!s->line)
-        return AVERROR(ENOMEM);
+    for (i = 0; i < 3; i++) {
+        s->line[i] = av_malloc_array(inlink->w, sizeof(*s->line[i]));
+        if (!s->line[i])
+            return AVERROR(ENOMEM);
+    }
 
     for (i = 0; i < 4; i++) {
         s->coefs[i] = precalc_coefs(s->strength[i], s->depth);
@@ -287,14 +291,38 @@  static int config_input(AVFilterLink *inlink)
     return 0;
 }
 
+struct ThreadData {
+    AVFrame *in, *out;
+    int direct;
+};
+
+static int do_denoise(AVFilterContext *ctx, void *data, int job_nr, int n_jobs)
+{
+    HQDN3DContext *s = ctx->priv;
+    struct ThreadData *td = data;
+    AVFrame *out = td->out;
+    AVFrame *in = td->in;
+    int direct = td->direct;
+
+    denoise(s, in->data[job_nr], out->data[job_nr],
+                s->line[job_nr], &s->frame_prev[job_nr],
+                AV_CEIL_RSHIFT(in->width,  (!!job_nr * s->hsub)),
+                AV_CEIL_RSHIFT(in->height, (!!job_nr * s->vsub)),
+                in->linesize[job_nr], out->linesize[job_nr],
+                s->coefs[job_nr ? CHROMA_SPATIAL : LUMA_SPATIAL],
+                s->coefs[job_nr ? CHROMA_TMP     : LUMA_TMP]);
+
+    return 0;
+}
+
 static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 {
     AVFilterContext *ctx  = inlink->dst;
-    HQDN3DContext *s = ctx->priv;
     AVFilterLink *outlink = ctx->outputs[0];
 
     AVFrame *out;
-    int c, direct = av_frame_is_writable(in) && !ctx->is_disabled;
+    int direct = av_frame_is_writable(in) && !ctx->is_disabled;
+    struct ThreadData td;
 
     if (direct) {
         out = in;
@@ -308,15 +336,11 @@  static int filter_frame(AVFilterLink *inlink, AVFrame *in)
         av_frame_copy_props(out, in);
     }
 
-    for (c = 0; c < 3; c++) {
-        denoise(s, in->data[c], out->data[c],
-                s->line, &s->frame_prev[c],
-                AV_CEIL_RSHIFT(in->width,  (!!c * s->hsub)),
-                AV_CEIL_RSHIFT(in->height, (!!c * s->vsub)),
-                in->linesize[c], out->linesize[c],
-                s->coefs[c ? CHROMA_SPATIAL : LUMA_SPATIAL],
-                s->coefs[c ? CHROMA_TMP     : LUMA_TMP]);
-    }
+    td.in = in;
+    td.out = out;
+    td.direct = direct;
+    /* one thread per planar */
+    ctx->internal->execute(ctx, do_denoise, &td, NULL, 3);
 
     if (ctx->is_disabled) {
         av_frame_free(&out);
@@ -370,5 +394,5 @@  AVFilter ff_vf_hqdn3d = {
     .query_formats = query_formats,
     .inputs        = avfilter_vf_hqdn3d_inputs,
     .outputs       = avfilter_vf_hqdn3d_outputs,
-    .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL,
+    .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL | AVFILTER_FLAG_SLICE_THREADS,
 };
diff --git a/libavfilter/vf_hqdn3d.h b/libavfilter/vf_hqdn3d.h
index 03a79a1..3279bbc 100644
--- a/libavfilter/vf_hqdn3d.h
+++ b/libavfilter/vf_hqdn3d.h
@@ -31,7 +31,7 @@ 
 typedef struct HQDN3DContext {
     const AVClass *class;
     int16_t *coefs[4];
-    uint16_t *line;
+    uint16_t *line[3];
     uint16_t *frame_prev[3];
     double strength[4];
     int hsub, vsub;