
[FFmpeg-devel] libavfilter: zscale performance optimization >4x

Message ID 20220219135809.1447-1-Victoria.Zhislina@intel.com
State New
Series [FFmpeg-devel] libavfilter: zscale performance optimization >4x

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_aarch64_jetson success Make finished
andriy/make_fate_aarch64_jetson success Make fate finished
andriy/make_armv7_RPi4 success Make finished
andriy/make_fate_armv7_RPi4 success Make fate finished

Commit Message

Victoria Zhislina Feb. 19, 2022, 1:58 p.m. UTC
By implementing ffmpeg threading support via frame slicing, and by doing
zimg_filter_graph_build (which used to take 30-60% of each frame's processing
time) only when necessary, i.e. when some parameters have changed,
a performance increase over the original version of more than 4x
in video downscale and color conversion is seen
on a 64-core Intel Xeon, and 3x on an i7-6700K (4 cores with HT)

Signed-off-by: Victoria Zhislina <Victoria.Zhislina@intel.com>
---
 libavfilter/vf_zscale.c | 417 +++++++++++++++++++++++++++-------------
 1 file changed, 288 insertions(+), 129 deletions(-)
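
The speedup comes from two independent changes: the expensive zimg_filter_graph_build
call is cached and re-run only when its inputs actually change, and the per-frame work is
spread over FFmpeg's slice threads, each job handling a horizontal slice of the frame.
Below is a small standalone sketch of the slice arithmetic used for that division (even
slice heights, as zimg requires for subsampled formats, with the leftover rows folded into
the last job); the slice_bounds() helper is illustrative only and does not exist in
vf_zscale.c:

    #include <stdio.h>

    /* Hypothetical helper mirroring the patch's slice division: every job
     * gets an even-height slice, and the last job also absorbs the
     * remaining rows (the "tail"). */
    static void slice_bounds(int frame_h, int nb_jobs, int job_nr,
                             int *top, int *height)
    {
        int slice_h = (frame_h / nb_jobs) & ~1;   /* force an even height */

        *top    = job_nr * slice_h;
        *height = (job_nr == nb_jobs - 1) ? frame_h - *top : slice_h;
    }

    int main(void)
    {
        for (int job = 0; job < 4; job++) {
            int top, height;
            slice_bounds(1087, 4, job, &top, &height);
            printf("job %d: top %d, height %d\n", job, top, height);
        }
        return 0;
    }

In the patch itself these per-job bounds are applied through zimg's active_region for the
input slice and a per-job destination format, and each job keeps its own filter graph and
temporary buffer.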

Comments

Paul B Mahol Feb. 20, 2022, 6:51 p.m. UTC | #1
On Sat, Feb 19, 2022 at 04:58:09PM +0300, Victoria Zhislina wrote:
> By ffmpeg threading support implementation via frame slicing and doing
> zimg_filter_graph_build that used to take 30-60% of each frame processig
> only if necessary (some parameters changed)
> the performance increase vs original version
> in video downscale and color conversion  >4x is seen
> on 64 cores Intel Xeon, 3x on i7-6700K (4 cores with HT)
> 
> Signed-off-by: Victoria Zhislina <Victoria.Zhislina@intel.com>
> ---
>  libavfilter/vf_zscale.c | 417 +++++++++++++++++++++++++++-------------
>  1 file changed, 288 insertions(+), 129 deletions(-)
> 
> diff --git a/libavfilter/vf_zscale.c b/libavfilter/vf_zscale.c
> index 1288c5efc1..61418d4a4a 100644
> --- a/libavfilter/vf_zscale.c
> +++ b/libavfilter/vf_zscale.c
> @@ -1,6 +1,7 @@
>  /*
>   * Copyright (c) 2015 Paul B Mahol
> - *
> + * 2022 Victoria Zhislina, Intel
> +
>   * This file is part of FFmpeg.
>   *
>   * FFmpeg is free software; you can redistribute it and/or
> @@ -44,6 +45,8 @@
>  #include "libavutil/imgutils.h"
>  
>  #define ZIMG_ALIGNMENT 32
> +#define MIN_TILESIZE 64
> +#define MAX_THREADS 64
>  
>  static const char *const var_names[] = {
>      "in_w",   "iw",
> @@ -113,13 +116,17 @@ typedef struct ZScaleContext {
>  
>      int force_original_aspect_ratio;
>  
> -    void *tmp;
> -    size_t tmp_size;
> +    void *tmp[MAX_THREADS]; //separate for each thread;
> +    int nb_threads;
> +    int slice_h;
>  
>      zimg_image_format src_format, dst_format;
>      zimg_image_format alpha_src_format, alpha_dst_format;
> +    zimg_image_format src_format_tmp, dst_format_tmp;
> +    zimg_image_format alpha_src_format_tmp, alpha_dst_format_tmp;
>      zimg_graph_builder_params alpha_params, params;
> -    zimg_filter_graph *alpha_graph, *graph;
> +    zimg_graph_builder_params alpha_params_tmp, params_tmp;
> +    zimg_filter_graph *alpha_graph[MAX_THREADS], *graph[MAX_THREADS];
>  
>      enum AVColorSpace in_colorspace, out_colorspace;
>      enum AVColorTransferCharacteristic in_trc, out_trc;
> @@ -128,10 +135,36 @@ typedef struct ZScaleContext {
>      enum AVChromaLocation in_chromal, out_chromal;
>  } ZScaleContext;
>  
> +typedef struct ThreadData {
> +    const AVPixFmtDescriptor *desc, *odesc;
> +    AVFrame *in, *out;
> +} ThreadData;
> +
>  static av_cold int init(AVFilterContext *ctx)
>  {
>      ZScaleContext *s = ctx->priv;
>      int ret;
> +    int i;

remove that line above

> +
> +    for (i = 0; i < MAX_THREADS; i++) {

for (int i ...

> +        s->tmp[i] = NULL;
> +        s->graph[i] = NULL;
> +        s->alpha_graph[i] = NULL;
> +    }
> +    zimg_image_format_default(&s->src_format, ZIMG_API_VERSION);
> +    zimg_image_format_default(&s->dst_format, ZIMG_API_VERSION);
> +    zimg_image_format_default(&s->src_format_tmp, ZIMG_API_VERSION);
> +    zimg_image_format_default(&s->dst_format_tmp, ZIMG_API_VERSION);
> +
> +    zimg_image_format_default(&s->alpha_src_format, ZIMG_API_VERSION);
> +    zimg_image_format_default(&s->alpha_dst_format, ZIMG_API_VERSION);
> +    zimg_image_format_default(&s->alpha_src_format_tmp, ZIMG_API_VERSION);
> +    zimg_image_format_default(&s->alpha_dst_format_tmp, ZIMG_API_VERSION);
> +
> +    zimg_graph_builder_params_default(&s->params, ZIMG_API_VERSION);
> +    zimg_graph_builder_params_default(&s->params_tmp, ZIMG_API_VERSION);
> +    zimg_graph_builder_params_default(&s->alpha_params, ZIMG_API_VERSION);
> +    zimg_graph_builder_params_default(&s->alpha_params_tmp, ZIMG_API_VERSION);
>  
>      if (s->size_str && (s->w_expr || s->h_expr)) {
>          av_log(ctx, AV_LOG_ERROR,
> @@ -158,7 +191,6 @@ static av_cold int init(AVFilterContext *ctx)
>          av_opt_set(s, "w", "iw", 0);
>      if (!s->h_expr)
>          av_opt_set(s, "h", "ih", 0);
> -
>      return 0;
>  }
>  
> @@ -471,6 +503,51 @@ static enum AVColorRange convert_range_from_zimg(enum zimg_pixel_range_e color_r
>      return AVCOL_RANGE_UNSPECIFIED;
>  }
>  
> +/* returns 0 if image formats are the same and 1 otherwise */
> +static int compare_zimg_image_formats(zimg_image_format *img_fmt0, zimg_image_format *img_fmt1)
> +{
> +    return ((img_fmt0->chroma_location != img_fmt1->chroma_location) ||
> +#if ZIMG_API_VERSION >= 0x204
> +        (img_fmt0->alpha != img_fmt1->alpha) ||
> +#endif
> +        (img_fmt0->color_family != img_fmt1->color_family) ||
> +        (img_fmt0->color_primaries != img_fmt1->color_primaries) ||
> +        (img_fmt0->depth != img_fmt1->depth) ||
> +        (img_fmt0->field_parity != img_fmt1->field_parity) ||
> +        (img_fmt0->height != img_fmt1->height) ||
> +        (img_fmt0->matrix_coefficients != img_fmt1->matrix_coefficients) ||
> +        (img_fmt0->pixel_range != img_fmt1->pixel_range) ||
> +        (img_fmt0->pixel_type != img_fmt1->pixel_type) ||
> +        (img_fmt0->subsample_h != img_fmt1->subsample_h) ||
> +        (img_fmt0->subsample_w != img_fmt1->subsample_w) ||
> +        (img_fmt0->transfer_characteristics != img_fmt1->transfer_characteristics) ||
> +        (img_fmt0->width != img_fmt1->width));
> +}
> +
> +/* returns 0 if graph builder parameters are the same and 1 otherwise */
> +static int compare_zimg_graph_builder_params(zimg_graph_builder_params *parm0, zimg_graph_builder_params *parm1)
> +{
> +    /* the parameters that could be changed inside a single ffmpeg zscale invocation  are checked only
> +    and NaN values that are default for some params are treated properly*/
> +    int ret = (parm0->allow_approximate_gamma != parm1->allow_approximate_gamma) ||
> +        (parm0->dither_type != parm1->dither_type) ||
> +        (parm0->resample_filter != parm1->resample_filter) ||
> +        (parm0->resample_filter_uv != parm1->resample_filter_uv);
> +
> +    if ((isnan(parm0->nominal_peak_luminance) == 0) || (isnan(parm1->nominal_peak_luminance) == 0))
> +        ret = ret || (parm0->nominal_peak_luminance != parm1->nominal_peak_luminance);
> +    if ((isnan(parm0->filter_param_a) == 0) || (isnan(parm1->filter_param_a) == 0))
> +        ret = ret || (parm0->filter_param_a != parm1->filter_param_a);
> +    if ((isnan(parm0->filter_param_a_uv) == 0) || (isnan(parm1->filter_param_a_uv) == 0))
> +        ret = ret || (parm0->filter_param_a_uv != parm1->filter_param_a_uv);
> +    if ((isnan(parm0->filter_param_b) == 0) || (isnan(parm1->filter_param_b) == 0))
> +        ret = ret || (parm0->filter_param_b != parm1->filter_param_b);
> +    if ((isnan(parm0->filter_param_b_uv) == 0) || (isnan(parm1->filter_param_b_uv) == 0))
> +        ret = ret || (parm0->filter_param_b_uv != parm1->filter_param_b_uv);
> +
> +    return ret;
> +}
> +
>  static void format_init(zimg_image_format *format, AVFrame *frame, const AVPixFmtDescriptor *desc,
>                          int colorspace, int primaries, int transfer, int range, int location)
>  {
> @@ -483,46 +560,94 @@ static void format_init(zimg_image_format *format, AVFrame *frame, const AVPixFm
>      format->color_family = (desc->flags & AV_PIX_FMT_FLAG_RGB) ? ZIMG_COLOR_RGB : ZIMG_COLOR_YUV;
>      format->matrix_coefficients = (desc->flags & AV_PIX_FMT_FLAG_RGB) ? ZIMG_MATRIX_RGB : colorspace == -1 ? convert_matrix(frame->colorspace) : colorspace;
>      format->color_primaries = primaries == -1 ? convert_primaries(frame->color_primaries) : primaries;
> -    format->transfer_characteristics = transfer == - 1 ? convert_trc(frame->color_trc) : transfer;
> +    format->transfer_characteristics = transfer == -1 ? convert_trc(frame->color_trc) : transfer;
>      format->pixel_range = (desc->flags & AV_PIX_FMT_FLAG_RGB) ? ZIMG_RANGE_FULL : range == -1 ? convert_range(frame->color_range) : range;
>      format->chroma_location = location == -1 ? convert_chroma_location(frame->chroma_location) : location;
>  }
>  
> -static int graph_build(zimg_filter_graph **graph, zimg_graph_builder_params *params,
> -                       zimg_image_format *src_format, zimg_image_format *dst_format,
> -                       void **tmp, size_t *tmp_size)
> +static int graphs_build(AVFrame *in, AVFrame *out, const AVPixFmtDescriptor *desc, const AVPixFmtDescriptor *out_desc,
> +                        ZScaleContext *s, int job_nr)
>  {
>      int ret;
>      size_t size;
> +    zimg_image_format src_format;
> +    zimg_image_format dst_format;
> +    zimg_image_format alpha_src_format;
> +    zimg_image_format alpha_dst_format;
> +
> +    src_format = s->src_format;
> +    dst_format = s->dst_format;
> +    /* The input slice is specified through the active_region field,
> +    unlike the output slice.
> +    according to zimg requirements input and output slices should have even dimensions */
> +    src_format.active_region.width = in->width;
> +    src_format.active_region.height = s->slice_h;
> +    src_format.active_region.left = 0;
> +    src_format.active_region.top = job_nr * src_format.active_region.height;
> +    //dst now is the single tile only!!
> +    dst_format.width = out->width;
> +    dst_format.height = ((unsigned int)(out->height / s->nb_threads)) & 0xfffffffe;
> +
> +    //the last slice could differ from the previous ones due to the slices division "tail"
> +    if (job_nr == (s->nb_threads - 1)) {
> +        src_format.active_region.height = src_format.height - src_format.active_region.top;
> +        dst_format.height = out->height - job_nr * dst_format.height;
> +    }
>  
> -    zimg_filter_graph_free(*graph);
> -    *graph = zimg_filter_graph_build(src_format, dst_format, params);
> -    if (!*graph)
> +    if (s->graph[job_nr]) {
> +        zimg_filter_graph_free(s->graph[job_nr]);
> +    }
> +    s->graph[job_nr] = zimg_filter_graph_build(&src_format, &dst_format, &s->params);
> +    if (!s->graph[job_nr])
>          return print_zimg_error(NULL);
>  
> -    ret = zimg_filter_graph_get_tmp_size(*graph, &size);
> +    ret = zimg_filter_graph_get_tmp_size(s->graph[job_nr], &size);
>      if (ret)
>          return print_zimg_error(NULL);
>  
> -    if (size > *tmp_size) {
> -        av_freep(tmp);
> -        *tmp = av_malloc(size);
> -        if (!*tmp)
> -            return AVERROR(ENOMEM);
> -
> -        *tmp_size = size;
> -    }
> +    if (s->tmp[job_nr])

    not needed, remove line above

> +        av_freep(&s->tmp[job_nr]);
> +    s->tmp[job_nr] = av_malloc(size);
> +    if (!s->tmp[job_nr])
> +        return AVERROR(ENOMEM);
> +
> +    if (desc->flags & AV_PIX_FMT_FLAG_ALPHA && out_desc->flags & AV_PIX_FMT_FLAG_ALPHA) {
> +        alpha_src_format = s->alpha_src_format;
> +        alpha_dst_format = s->alpha_dst_format;
> +        /* The input slice is specified through the active_region field, unlike the output slice.
> +        according to zimg requirements input and output slices should have even dimensions */
> +        alpha_src_format.active_region.width = in->width;
> +        alpha_src_format.active_region.height = s->slice_h;
> +        alpha_src_format.active_region.left = 0;
> +        alpha_src_format.active_region.top = job_nr * alpha_src_format.active_region.height;
> +        //dst now is the single tile only!!
> +        alpha_dst_format.width = out->width;
> +        alpha_dst_format.height = ((unsigned int)(out->height / s->nb_threads)) & 0xfffffffe;
> +
> +        //the last slice could differ from the previous ones due to the slices division "tail"
> +        if (job_nr == (s->nb_threads - 1)) {
> +            alpha_src_format.active_region.height = alpha_src_format.height - alpha_src_format.active_region.top;
> +            alpha_dst_format.height = out->height - job_nr * alpha_dst_format.height;
> +        }
>  
> +        if (s->alpha_graph[job_nr]) {
> +            zimg_filter_graph_free(s->alpha_graph[job_nr]);
> +        }
> +        s->alpha_graph[job_nr] = zimg_filter_graph_build(&alpha_src_format, &alpha_dst_format, &s->alpha_params);
> +        if (!s->alpha_graph[job_nr])
> +            return print_zimg_error(NULL);
> +     }
>      return 0;
>  }
>  
>  static int realign_frame(const AVPixFmtDescriptor *desc, AVFrame **frame)
>  {
>      AVFrame *aligned = NULL;
> -    int ret = 0, plane;
> +    int ret = 0, plane, planes;
>  
>      /* Realign any unaligned input frame. */
> -    for (plane = 0; plane < 3; plane++) {
> +    planes = av_pix_fmt_count_planes(desc->nb_components);
> +    for (plane = 0; plane < planes; plane++) {
>          int p = desc->comp[plane].plane;
>          if ((uintptr_t)(*frame)->data[p] % ZIMG_ALIGNMENT || (*frame)->linesize[p] % ZIMG_ALIGNMENT) {
>              if (!(aligned = av_frame_alloc())) {
> @@ -572,20 +697,77 @@ static void update_output_color_information(ZScaleContext *s, AVFrame *frame)
>          frame->chroma_location = (int)s->dst_format.chroma_location + 1;
>  }
>  
> +static int filter_slice(AVFilterContext *ctx, void *data, int job_nr, int n_jobs)
> +{
> +    ThreadData *td = data;
> +    int ret = 0;
> +    int p;
> +    int out_sampl;
> +    int need_gb;
> +    ZScaleContext *s = ctx->priv;
> +    zimg_image_buffer_const src_buf = { ZIMG_API_VERSION };
> +    zimg_image_buffer dst_buf = { ZIMG_API_VERSION };
> +    int  dst_tile_height = ((unsigned int)(td->out->height / n_jobs)) & 0xfffffffe;
> +
> +    /* create zimg filter graphs for each thread
> +     only if not created earlier or there is some change in frame parameters */
> +    need_gb = compare_zimg_image_formats(&s->src_format, &s->src_format_tmp) ||
> +        compare_zimg_image_formats(&s->dst_format, &s->dst_format_tmp) ||
> +        compare_zimg_graph_builder_params(&s->params, &s->params_tmp);
> +    if(td->desc->flags & AV_PIX_FMT_FLAG_ALPHA && td->odesc->flags & AV_PIX_FMT_FLAG_ALPHA)
> +        need_gb = need_gb || compare_zimg_image_formats(&s->alpha_src_format, &s->alpha_src_format_tmp) ||
> +            compare_zimg_image_formats(&s->alpha_dst_format, &s->alpha_dst_format_tmp) ||
> +            compare_zimg_graph_builder_params(&s->alpha_params, &s->alpha_params_tmp);
> +
> +    if (need_gb){
> +        ret = graphs_build(td->in, td->out, td->desc, td->odesc, s, job_nr);
> +        if (ret < 0)
> +            return print_zimg_error(ctx);
> +    }
> +    out_sampl = FFMAX3(td->out->linesize[0], td->out->linesize[1], td->out->linesize[2]);
> +    for (int i = 0; i < 3; i++) {
> +        p = td->desc->comp[i].plane;
> +
> +        src_buf.plane[i].data = td->in->data[p];
> +        src_buf.plane[i].stride = td->in->linesize[p];
> +        src_buf.plane[i].mask = -1;
> +
> +        p = td->odesc->comp[i].plane;
> +        dst_buf.plane[i].data = td->out->data[p] + td->out->linesize[p] * dst_tile_height * td->out->linesize[p] / out_sampl * job_nr;
> +        dst_buf.plane[i].stride = td->out->linesize[p];
> +        dst_buf.plane[i].mask = -1;
> +    }
> +    ret = zimg_filter_graph_process(s->graph[job_nr], &src_buf, &dst_buf, s->tmp[job_nr], 0, 0, 0, 0);
> +    if (ret)
> +        return  print_zimg_error(ctx);
> +
> +    if (td->desc->flags & AV_PIX_FMT_FLAG_ALPHA && td->odesc->flags & AV_PIX_FMT_FLAG_ALPHA) {
> +        src_buf.plane[0].data = td->in->data[3];
> +        src_buf.plane[0].stride = td->in->linesize[3];
> +        src_buf.plane[0].mask = -1;
> +
> +        dst_buf.plane[0].data = td->out->data[3] + td->out->linesize[3] * dst_tile_height  * job_nr;
> +        dst_buf.plane[0].stride = td->out->linesize[3];
> +        dst_buf.plane[0].mask = -1;
> +
> +        ret = zimg_filter_graph_process(s->alpha_graph[job_nr], &src_buf, &dst_buf, s->tmp[job_nr], 0, 0, 0, 0);
> +        if (ret)
> +            return print_zimg_error(ctx);
> +    }
> +    return 0;
> +}
> +
>  static int filter_frame(AVFilterLink *link, AVFrame *in)
>  {
> -    ZScaleContext *s = link->dst->priv;
> -    AVFilterLink *outlink = link->dst->outputs[0];
> +    AVFilterContext *ctx = link->dst;
> +    ZScaleContext *s = ctx->priv;
> +    AVFilterLink *outlink = ctx->outputs[0];
>      const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(link->format);
>      const AVPixFmtDescriptor *odesc = av_pix_fmt_desc_get(outlink->format);
> -    zimg_image_buffer_const src_buf = { ZIMG_API_VERSION };
> -    zimg_image_buffer dst_buf = { ZIMG_API_VERSION };
>      char buf[32];
> -    int ret = 0, plane;
> +    int ret = 0;
>      AVFrame *out = NULL;
> -
> -    if ((ret = realign_frame(desc, &in)) < 0)
> -        goto fail;
> +    ThreadData td;
>  
>      if (!(out = ff_get_video_buffer(outlink, outlink->w, outlink->h))) {
>          ret =  AVERROR(ENOMEM);
> @@ -596,35 +778,53 @@ static int filter_frame(AVFilterLink *link, AVFrame *in)
>      out->width  = outlink->w;
>      out->height = outlink->h;
>  
> -    if(   in->width  != link->w
> -       || in->height != link->h
> -       || in->format != link->format
> -       || s->in_colorspace != in->colorspace
> -       || s->in_trc  != in->color_trc
> -       || s->in_primaries != in->color_primaries
> -       || s->in_range != in->color_range
> -       || s->out_colorspace != out->colorspace
> -       || s->out_trc  != out->color_trc
> -       || s->out_primaries != out->color_primaries
> -       || s->out_range != out->color_range
> -       || s->in_chromal != in->chroma_location
> -       || s->out_chromal != out->chroma_location) {
> +    //we need to use this filter if something is different for an input and output only
> +    //otherwise - just copy the input frame to the output
> +    if ((link->w != outlink->w) ||
> +        (link->h != outlink->h) ||
> +        (s->src_format.chroma_location != s->dst_format.chroma_location) ||
> +        (s->src_format.color_family !=s->dst_format.color_family) ||
> +        (s->src_format.color_primaries !=s->dst_format.color_primaries) ||
> +        (s->src_format.depth !=s->dst_format.depth) ||
> +        (s->src_format.matrix_coefficients !=s->dst_format.matrix_coefficients) ||
> +        (s->src_format.field_parity !=s->dst_format.field_parity) ||
> +        (s->src_format.pixel_range !=s->dst_format.pixel_range) ||
> +        (s->src_format.pixel_type !=s->dst_format.pixel_type) ||
> +        (s->src_format.transfer_characteristics !=s->dst_format.transfer_characteristics)
> +    ){
> +        if ((ret = realign_frame(desc, &in)) < 0)
> +            goto fail;
> +
>          snprintf(buf, sizeof(buf)-1, "%d", outlink->w);
>          av_opt_set(s, "w", buf, 0);
>          snprintf(buf, sizeof(buf)-1, "%d", outlink->h);
>          av_opt_set(s, "h", buf, 0);
>  
> +

remove this unneeded line addition above

>          link->dst->inputs[0]->format = in->format;
>          link->dst->inputs[0]->w      = in->width;
>          link->dst->inputs[0]->h      = in->height;
>  
> -        if ((ret = config_props(outlink)) < 0)
> -            goto fail;
> +        s->nb_threads = FFMIN(ff_filter_get_nb_threads(ctx), link->h / MIN_TILESIZE);
> +        s->slice_h = ((unsigned int)(link->h / s->nb_threads)) & 0xfffffffe; // slice_h should be even for zimg
> +        s->in_colorspace = in->colorspace;
> +        s->in_trc = in->color_trc;
> +        s->in_primaries = in->color_primaries;
> +        s->in_range = in->color_range;
> +        s->out_colorspace = out->colorspace;
> +        s->out_trc = out->color_trc;
> +        s->out_primaries = out->color_primaries;
> +        s->out_range = out->color_range;
>  
>          zimg_image_format_default(&s->src_format, ZIMG_API_VERSION);
>          zimg_image_format_default(&s->dst_format, ZIMG_API_VERSION);
>          zimg_graph_builder_params_default(&s->params, ZIMG_API_VERSION);
>  
> +        format_init(&s->src_format, in, desc, s->colorspace_in,
> +            s->primaries_in, s->trc_in, s->range_in, s->chromal_in);
> +        format_init(&s->dst_format, out, odesc, s->colorspace,
> +            s->primaries, s->trc, s->range, s->chromal);
> +
>          s->params.dither_type = s->dither;
>          s->params.cpu_type = ZIMG_CPU_AUTO;
>          s->params.resample_filter = s->filter;
> @@ -634,27 +834,6 @@ static int filter_frame(AVFilterLink *link, AVFrame *in)
>          s->params.filter_param_a = s->params.filter_param_a_uv = s->param_a;
>          s->params.filter_param_b = s->params.filter_param_b_uv = s->param_b;
>  
> -        format_init(&s->src_format, in, desc, s->colorspace_in,
> -                    s->primaries_in, s->trc_in, s->range_in, s->chromal_in);
> -        format_init(&s->dst_format, out, odesc, s->colorspace,
> -                    s->primaries, s->trc, s->range, s->chromal);
> -
> -        update_output_color_information(s, out);
> -
> -        ret = graph_build(&s->graph, &s->params, &s->src_format, &s->dst_format,
> -                          &s->tmp, &s->tmp_size);
> -        if (ret < 0)
> -            goto fail;
> -
> -        s->in_colorspace  = in->colorspace;
> -        s->in_trc         = in->color_trc;
> -        s->in_primaries   = in->color_primaries;
> -        s->in_range       = in->color_range;
> -        s->out_colorspace = out->colorspace;
> -        s->out_trc        = out->color_trc;
> -        s->out_primaries  = out->color_primaries;
> -        s->out_range      = out->color_range;
> -
>          if (desc->flags & AV_PIX_FMT_FLAG_ALPHA && odesc->flags & AV_PIX_FMT_FLAG_ALPHA) {
>              zimg_image_format_default(&s->alpha_src_format, ZIMG_API_VERSION);
>              zimg_image_format_default(&s->alpha_dst_format, ZIMG_API_VERSION);
> @@ -670,76 +849,54 @@ static int filter_frame(AVFilterLink *link, AVFrame *in)
>              s->alpha_src_format.pixel_type = (desc->flags & AV_PIX_FMT_FLAG_FLOAT) ? ZIMG_PIXEL_FLOAT : desc->comp[0].depth > 8 ? ZIMG_PIXEL_WORD : ZIMG_PIXEL_BYTE;
>              s->alpha_src_format.color_family = ZIMG_COLOR_GREY;
>  
> -            s->alpha_dst_format.width = out->width;
> -            s->alpha_dst_format.height = out->height;
>              s->alpha_dst_format.depth = odesc->comp[0].depth;
>              s->alpha_dst_format.pixel_type = (odesc->flags & AV_PIX_FMT_FLAG_FLOAT) ? ZIMG_PIXEL_FLOAT : odesc->comp[0].depth > 8 ? ZIMG_PIXEL_WORD : ZIMG_PIXEL_BYTE;
>              s->alpha_dst_format.color_family = ZIMG_COLOR_GREY;
> -
> -            zimg_filter_graph_free(s->alpha_graph);
> -            s->alpha_graph = zimg_filter_graph_build(&s->alpha_src_format, &s->alpha_dst_format, &s->alpha_params);
> -            if (!s->alpha_graph) {
> -                ret = print_zimg_error(link->dst);
> -                goto fail;
> -            }
>          }
> -    }
> -
> -    update_output_color_information(s, out);
> -
> -    av_reduce(&out->sample_aspect_ratio.num, &out->sample_aspect_ratio.den,
> -              (int64_t)in->sample_aspect_ratio.num * outlink->h * link->w,
> -              (int64_t)in->sample_aspect_ratio.den * outlink->w * link->h,
> -              INT_MAX);
> -
> -    for (plane = 0; plane < 3; plane++) {
> -        int p = desc->comp[plane].plane;
> -        src_buf.plane[plane].data   = in->data[p];
> -        src_buf.plane[plane].stride = in->linesize[p];
> -        src_buf.plane[plane].mask   = -1;
> -
> -        p = odesc->comp[plane].plane;
> -        dst_buf.plane[plane].data   = out->data[p];
> -        dst_buf.plane[plane].stride = out->linesize[p];
> -        dst_buf.plane[plane].mask   = -1;
> -    }
>  
> -    ret = zimg_filter_graph_process(s->graph, &src_buf, &dst_buf, s->tmp, 0, 0, 0, 0);
> -    if (ret) {
> -        ret = print_zimg_error(link->dst);
> -        goto fail;
> -    }
> +        update_output_color_information(s, out);
> +        av_reduce(&out->sample_aspect_ratio.num, &out->sample_aspect_ratio.den,
> +                  (int64_t)in->sample_aspect_ratio.num * outlink->h * link->w,
> +                  (int64_t)in->sample_aspect_ratio.den * outlink->w * link->h,
> +                  INT_MAX);
>  
> -    if (desc->flags & AV_PIX_FMT_FLAG_ALPHA && odesc->flags & AV_PIX_FMT_FLAG_ALPHA) {
> -        src_buf.plane[0].data   = in->data[3];
> -        src_buf.plane[0].stride = in->linesize[3];
> -        src_buf.plane[0].mask   = -1;
> +        td.in = in;
> +        td.out = out;
> +        td.desc = desc;
> +        td.odesc = odesc;
>  
> -        dst_buf.plane[0].data   = out->data[3];
> -        dst_buf.plane[0].stride = out->linesize[3];
> -        dst_buf.plane[0].mask   = -1;
> +        ff_filter_execute(ctx, filter_slice, &td, NULL, s->nb_threads);
>  
> -        ret = zimg_filter_graph_process(s->alpha_graph, &src_buf, &dst_buf, s->tmp, 0, 0, 0, 0);
> -        if (ret) {
> -            ret = print_zimg_error(link->dst);
> -            goto fail;
> +        s->src_format_tmp = s->src_format;
> +        s->dst_format_tmp = s->dst_format;
> +        s->params_tmp = s->params;
> +        if (desc->flags & AV_PIX_FMT_FLAG_ALPHA && odesc->flags & AV_PIX_FMT_FLAG_ALPHA) {
> +            s->alpha_src_format_tmp = s->alpha_src_format;
> +            s->alpha_dst_format_tmp = s->alpha_dst_format;
> +            s->alpha_params_tmp = s->alpha_params;
>          }
> -    } else if (odesc->flags & AV_PIX_FMT_FLAG_ALPHA) {
> -        int x, y;
> -
> -        if (odesc->flags & AV_PIX_FMT_FLAG_FLOAT) {
> -            for (y = 0; y < out->height; y++) {
> -                for (x = 0; x < out->width; x++) {
> -                    AV_WN32(out->data[3] + x * odesc->comp[3].step + y * out->linesize[3],
> -                            av_float2int(1.0f));
> +
> +        if ((!(desc->flags & AV_PIX_FMT_FLAG_ALPHA)) && (odesc->flags & AV_PIX_FMT_FLAG_ALPHA) ){
> +            int x, y;
> +            if (odesc->flags & AV_PIX_FMT_FLAG_FLOAT) {
> +                for (y = 0; y < out->height; y++) {
> +                    for (x = 0; x < out->width; x++) {
> +                        AV_WN32(out->data[3] + x * odesc->comp[3].step + y * out->linesize[3],
> +                                av_float2int(1.0f));
> +                    }
>                  }
> +            } else {
> +                for (y = 0; y < outlink->h; y++)
> +                    memset(out->data[3] + y * out->linesize[3], 0xff, outlink->w);
>              }
> -        } else {
> -            for (y = 0; y < outlink->h; y++)
> -                memset(out->data[3] + y * out->linesize[3], 0xff, outlink->w);
>          }
>      }
> -
> +    else {
> +        /*no need for any filtering */
> +        ret = av_frame_copy(out, in);

You could just return the 'in' frame, no need to do a full frame copy in the no-op scenario.
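
Roughly like this (conversion_needed is a placeholder for the check above, and this
assumes the output buffer allocation is also moved under the conversion branch so
nothing has been allocated yet at this point):

    /* no-op case: nothing to scale or convert, pass the input frame
     * downstream as-is instead of copying it into a new buffer */
    if (!conversion_needed)
        return ff_filter_frame(outlink, in);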

> +        if (ret < 0)
> +            return ret;
> +    }
>  fail:
>      av_frame_free(&in);
>      if (ret) {
> @@ -753,11 +910,12 @@ fail:
>  static av_cold void uninit(AVFilterContext *ctx)
>  {
>      ZScaleContext *s = ctx->priv;
> -
> -    zimg_filter_graph_free(s->graph);
> -    zimg_filter_graph_free(s->alpha_graph);
> -    av_freep(&s->tmp);
> -    s->tmp_size = 0;
> +    int i;
> +    for (i = 0; i < s->nb_threads; i++) {

    for (int i = 0; i < s->nb_threads; i++) {

> +        if (s->tmp[i]) av_freep(&s->tmp[i]);

use just:
           av_freep(&s->tmp[i]);

> +        zimg_filter_graph_free(s->graph[i]);
> +        zimg_filter_graph_free(s->alpha_graph[i]);
> +    }
>  }
>  
>  static int process_command(AVFilterContext *ctx, const char *cmd, const char *args,
> @@ -941,4 +1099,5 @@ const AVFilter ff_vf_zscale = {
>      FILTER_OUTPUTS(avfilter_vf_zscale_outputs),
>      FILTER_QUERY_FUNC(query_formats),
>      .process_command = process_command,
> +    .flags           = AVFILTER_FLAG_SLICE_THREADS,
>  };
> -- 
> 2.31.1.windows.1
> 
Lynne Feb. 22, 2022, 5:25 a.m. UTC | #2
19 Feb 2022, 14:58 by niva213@gmail.com:

> By ffmpeg threading support implementation via frame slicing and doing
> zimg_filter_graph_build that used to take 30-60% of each frame processig
> only if necessary (some parameters changed)
> the performance increase vs original version
> in video downscale and color conversion  >4x is seen
> on 64 cores Intel Xeon, 3x on i7-6700K (4 cores with HT)
>
> Signed-off-by: Victoria Zhislina <Victoria.Zhislina@intel.com>
>

Can't you patch such a feature into the upstream instead?
Paul B Mahol Feb. 22, 2022, 8:15 a.m. UTC | #3
On Tue, Feb 22, 2022 at 6:25 AM Lynne <dev@lynne.ee> wrote:

> 19 Feb 2022, 14:58 by niva213@gmail.com:
>
> > By ffmpeg threading support implementation via frame slicing and doing
> > zimg_filter_graph_build that used to take 30-60% of each frame processig
> > only if necessary (some parameters changed)
> > the performance increase vs original version
> > in video downscale and color conversion  >4x is seen
> > on 64 cores Intel Xeon, 3x on i7-6700K (4 cores with HT)
> >
> > Signed-off-by: Victoria Zhislina <Victoria.Zhislina@intel.com>
> >
>
> Can't you patch such a feature into the upstream instead?
>

zscale already has its own threading ability, but it is very hard to use, last
time I tried.


Paul B Mahol Feb. 22, 2022, 8:16 a.m. UTC | #4
On Tue, Feb 22, 2022 at 9:15 AM Paul B Mahol <onemda@gmail.com> wrote:

>
>
> On Tue, Feb 22, 2022 at 6:25 AM Lynne <dev@lynne.ee> wrote:
>
>> 19 Feb 2022, 14:58 by niva213@gmail.com:
>>
>> > By ffmpeg threading support implementation via frame slicing and doing
>> > zimg_filter_graph_build that used to take 30-60% of each frame processig
>> > only if necessary (some parameters changed)
>> > the performance increase vs original version
>> > in video downscale and color conversion  >4x is seen
>> > on 64 cores Intel Xeon, 3x on i7-6700K (4 cores with HT)
>> >
>> > Signed-off-by: Victoria Zhislina <Victoria.Zhislina@intel.com>
>> >
>>
>> Can't you patch such a feature into the upstream instead?
>>
>
> zscale already have own threading ability, but is very hard to use it,
> last time i tried.
>

I mean zimg.


Victoria Zhislina Feb. 22, 2022, 9:34 a.m. UTC | #5
Paul, I've got exactly the same feelings about zimg threading usage myself and
decided to go down the standard ffmpeg threading route.
It looks more consistent here.
Megathanks for reviewing my patch, I've fixed everything you've mentioned
and even more.
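
For reference, the standard ffmpeg threading route boils down to the pattern below (a
skeleton only, with all the zimg-specific work omitted); it is what the patch does via
ThreadData, filter_slice() and AVFILTER_FLAG_SLICE_THREADS:

    typedef struct ThreadData {
        AVFrame *in, *out;
    } ThreadData;

    static int filter_slice(AVFilterContext *ctx, void *data, int job_nr, int n_jobs)
    {
        ThreadData *td = data;
        /* process one horizontal slice of td->in into td->out here */
        return 0;
    }

    /* inside filter_frame(): */
    ThreadData td = { .in = in, .out = out };
    ff_filter_execute(ctx, filter_slice, &td, NULL, nb_threads);

    /* and the filter declares: .flags = AVFILTER_FLAG_SLICE_THREADS */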

On Tue, Feb 22, 2022 at 11:15 AM Paul B Mahol <onemda@gmail.com> wrote:

> On Tue, Feb 22, 2022 at 9:15 AM Paul B Mahol <onemda@gmail.com> wrote:
>
> >
> >
> > On Tue, Feb 22, 2022 at 6:25 AM Lynne <dev@lynne.ee> wrote:
> >
> >> 19 Feb 2022, 14:58 by niva213@gmail.com:
> >>
> >> > By ffmpeg threading support implementation via frame slicing and doing
> >> > zimg_filter_graph_build that used to take 30-60% of each frame
> processig
> >> > only if necessary (some parameters changed)
> >> > the performance increase vs original version
> >> > in video downscale and color conversion  >4x is seen
> >> > on 64 cores Intel Xeon, 3x on i7-6700K (4 cores with HT)
> >> >
> >> > Signed-off-by: Victoria Zhislina <Victoria.Zhislina@intel.com>
> >> >
> >>
> >> Can't you patch such a feature into the upstream instead?
> >>
> >
> > zscale already have own threading ability, but is very hard to use it,
> > last time i tried.
> >
>
> I mean zimg.
>
>
Victoria Zhislina March 10, 2022, 6:41 p.m. UTC | #6
Paul and all, have you had any chance to review my patch from Feb 19? I
assume I've fixed everything you've kindly pointed out and even more. Please
correct me if I'm wrong. The only remaining question is: are you ok
with the combination of threading and conditional filter operation (= do
something only if it is really required), or do you prefer to split it into two
separate patches? I'd prefer the first option because it
keeps the ffmpeg git repo and ffmpeg development cleaner, not dirtier...

On Tue, Feb 22, 2022 at 11:15 AM Paul B Mahol <onemda@gmail.com> wrote:

> On Tue, Feb 22, 2022 at 9:15 AM Paul B Mahol <onemda@gmail.com> wrote:
>
> >
> >
> > On Tue, Feb 22, 2022 at 6:25 AM Lynne <dev@lynne.ee> wrote:
> >
> >> 19 Feb 2022, 14:58 by niva213@gmail.com:
> >>
> >> > By ffmpeg threading support implementation via frame slicing and doing
> >> > zimg_filter_graph_build that used to take 30-60% of each frame
> processig
> >> > only if necessary (some parameters changed)
> >> > the performance increase vs original version
> >> > in video downscale and color conversion  >4x is seen
> >> > on 64 cores Intel Xeon, 3x on i7-6700K (4 cores with HT)
> >> >
> >> > Signed-off-by: Victoria Zhislina <Victoria.Zhislina@intel.com>
> >> >
> >>
> >> Can't you patch such a feature into the upstream instead?
> >>
> >
> > zscale already have own threading ability, but is very hard to use it,
> > last time i tried.
> >
>
> I mean zimg.
>
>
Paul B Mahol March 10, 2022, 6:47 p.m. UTC | #7
On Thu, Mar 10, 2022 at 7:41 PM Victoria Zhislina <niva213@gmail.com> wrote:

> Paul and all, do you have any chances to view my patch from Feb,19? I
> assume I'\ve fixed all you've kindly pointed out and even more. Please
> correct me if I'm wrong. The only question remaining is - are you ok
> with the combination of threading and conditional filter operation (= do
> something if it is really required only) or you prefer to split it to 2
> separate corresponding patches. I'd prefer the first option because it
> makes git ffmpeg repo and ffmpeg development cleaner not dirtier...
>

Patch was already applied and some issues that were found have been fixed.


>
> On Tue, Feb 22, 2022 at 11:15 AM Paul B Mahol <onemda@gmail.com> wrote:
>
>> On Tue, Feb 22, 2022 at 9:15 AM Paul B Mahol <onemda@gmail.com> wrote:
>>
>> >
>> >
>> > On Tue, Feb 22, 2022 at 6:25 AM Lynne <dev@lynne.ee> wrote:
>> >
>> >> 19 Feb 2022, 14:58 by niva213@gmail.com:
>> >>
>> >> > By ffmpeg threading support implementation via frame slicing and
>> doing
>> >> > zimg_filter_graph_build that used to take 30-60% of each frame
>> processig
>> >> > only if necessary (some parameters changed)
>> >> > the performance increase vs original version
>> >> > in video downscale and color conversion  >4x is seen
>> >> > on 64 cores Intel Xeon, 3x on i7-6700K (4 cores with HT)
>> >> >
>> >> > Signed-off-by: Victoria Zhislina <Victoria.Zhislina@intel.com>
>> >> >
>> >>
>> >> Can't you patch such a feature into the upstream instead?
>> >>
>> >
>> > zscale already have own threading ability, but is very hard to use it,
>> > last time i tried.
>> >
>>
>> I mean zimg.
>>
>>
Victoria Zhislina March 11, 2022, 4:42 p.m. UTC | #8
Awesome, thanks!

On Thu, Mar 10, 2022 at 9:45 PM Paul B Mahol <onemda@gmail.com> wrote:

>
>
> On Thu, Mar 10, 2022 at 7:41 PM Victoria Zhislina <niva213@gmail.com>
> wrote:
>
>> Paul and all, do you have any chances to view my patch from Feb,19? I
>> assume I'\ve fixed all you've kindly pointed out and even more. Please
>> correct me if I'm wrong. The only question remaining is - are you ok
>> with the combination of threading and conditional filter operation (= do
>> something if it is really required only) or you prefer to split it to 2
>> separate corresponding patches. I'd prefer the first option because it
>> makes git ffmpeg repo and ffmpeg development cleaner not dirtier...
>>
>
> Patch was already applied and some found issues fixed.
>
>
>>
>> On Tue, Feb 22, 2022 at 11:15 AM Paul B Mahol <onemda@gmail.com> wrote:
>>
>>> On Tue, Feb 22, 2022 at 9:15 AM Paul B Mahol <onemda@gmail.com> wrote:
>>>
>>> >
>>> >
>>> > On Tue, Feb 22, 2022 at 6:25 AM Lynne <dev@lynne.ee> wrote:
>>> >
>>> >> 19 Feb 2022, 14:58 by niva213@gmail.com:
>>> >>
>>> >> > By ffmpeg threading support implementation via frame slicing and
>>> doing
>>> >> > zimg_filter_graph_build that used to take 30-60% of each frame
>>> processig
>>> >> > only if necessary (some parameters changed)
>>> >> > the performance increase vs original version
>>> >> > in video downscale and color conversion  >4x is seen
>>> >> > on 64 cores Intel Xeon, 3x on i7-6700K (4 cores with HT)
>>> >> >
>>> >> > Signed-off-by: Victoria Zhislina <Victoria.Zhislina@intel.com>
>>> >> >
>>> >>
>>> >> Can't you patch such a feature into the upstream instead?
>>> >>
>>> >
>>> > zscale already have own threading ability, but is very hard to use it,
>>> > last time i tried.
>>> >
>>>
>>> I mean zimg.
>>>
>>>

Patch

diff --git a/libavfilter/vf_zscale.c b/libavfilter/vf_zscale.c
index 1288c5efc1..61418d4a4a 100644
--- a/libavfilter/vf_zscale.c
+++ b/libavfilter/vf_zscale.c
@@ -1,6 +1,7 @@ 
 /*
  * Copyright (c) 2015 Paul B Mahol
- *
+ * 2022 Victoria Zhislina, Intel
+
  * This file is part of FFmpeg.
  *
  * FFmpeg is free software; you can redistribute it and/or
@@ -44,6 +45,8 @@ 
 #include "libavutil/imgutils.h"
 
 #define ZIMG_ALIGNMENT 32
+#define MIN_TILESIZE 64
+#define MAX_THREADS 64
 
 static const char *const var_names[] = {
     "in_w",   "iw",
@@ -113,13 +116,17 @@  typedef struct ZScaleContext {
 
     int force_original_aspect_ratio;
 
-    void *tmp;
-    size_t tmp_size;
+    void *tmp[MAX_THREADS]; //separate for each thread;
+    int nb_threads;
+    int slice_h;
 
     zimg_image_format src_format, dst_format;
     zimg_image_format alpha_src_format, alpha_dst_format;
+    zimg_image_format src_format_tmp, dst_format_tmp;
+    zimg_image_format alpha_src_format_tmp, alpha_dst_format_tmp;
     zimg_graph_builder_params alpha_params, params;
-    zimg_filter_graph *alpha_graph, *graph;
+    zimg_graph_builder_params alpha_params_tmp, params_tmp;
+    zimg_filter_graph *alpha_graph[MAX_THREADS], *graph[MAX_THREADS];
 
     enum AVColorSpace in_colorspace, out_colorspace;
     enum AVColorTransferCharacteristic in_trc, out_trc;
@@ -128,10 +135,36 @@  typedef struct ZScaleContext {
     enum AVChromaLocation in_chromal, out_chromal;
 } ZScaleContext;
 
+typedef struct ThreadData {
+    const AVPixFmtDescriptor *desc, *odesc;
+    AVFrame *in, *out;
+} ThreadData;
+
 static av_cold int init(AVFilterContext *ctx)
 {
     ZScaleContext *s = ctx->priv;
     int ret;
+    int i;
+
+    for (i = 0; i < MAX_THREADS; i++) {
+        s->tmp[i] = NULL;
+        s->graph[i] = NULL;
+        s->alpha_graph[i] = NULL;
+    }
+    zimg_image_format_default(&s->src_format, ZIMG_API_VERSION);
+    zimg_image_format_default(&s->dst_format, ZIMG_API_VERSION);
+    zimg_image_format_default(&s->src_format_tmp, ZIMG_API_VERSION);
+    zimg_image_format_default(&s->dst_format_tmp, ZIMG_API_VERSION);
+
+    zimg_image_format_default(&s->alpha_src_format, ZIMG_API_VERSION);
+    zimg_image_format_default(&s->alpha_dst_format, ZIMG_API_VERSION);
+    zimg_image_format_default(&s->alpha_src_format_tmp, ZIMG_API_VERSION);
+    zimg_image_format_default(&s->alpha_dst_format_tmp, ZIMG_API_VERSION);
+
+    zimg_graph_builder_params_default(&s->params, ZIMG_API_VERSION);
+    zimg_graph_builder_params_default(&s->params_tmp, ZIMG_API_VERSION);
+    zimg_graph_builder_params_default(&s->alpha_params, ZIMG_API_VERSION);
+    zimg_graph_builder_params_default(&s->alpha_params_tmp, ZIMG_API_VERSION);
 
     if (s->size_str && (s->w_expr || s->h_expr)) {
         av_log(ctx, AV_LOG_ERROR,
@@ -158,7 +191,6 @@  static av_cold int init(AVFilterContext *ctx)
         av_opt_set(s, "w", "iw", 0);
     if (!s->h_expr)
         av_opt_set(s, "h", "ih", 0);
-
     return 0;
 }
 
@@ -471,6 +503,51 @@  static enum AVColorRange convert_range_from_zimg(enum zimg_pixel_range_e color_r
     return AVCOL_RANGE_UNSPECIFIED;
 }
 
+/* returns 0 if image formats are the same and 1 otherwise */
+static int compare_zimg_image_formats(zimg_image_format *img_fmt0, zimg_image_format *img_fmt1)
+{
+    return ((img_fmt0->chroma_location != img_fmt1->chroma_location) ||
+#if ZIMG_API_VERSION >= 0x204
+        (img_fmt0->alpha != img_fmt1->alpha) ||
+#endif
+        (img_fmt0->color_family != img_fmt1->color_family) ||
+        (img_fmt0->color_primaries != img_fmt1->color_primaries) ||
+        (img_fmt0->depth != img_fmt1->depth) ||
+        (img_fmt0->field_parity != img_fmt1->field_parity) ||
+        (img_fmt0->height != img_fmt1->height) ||
+        (img_fmt0->matrix_coefficients != img_fmt1->matrix_coefficients) ||
+        (img_fmt0->pixel_range != img_fmt1->pixel_range) ||
+        (img_fmt0->pixel_type != img_fmt1->pixel_type) ||
+        (img_fmt0->subsample_h != img_fmt1->subsample_h) ||
+        (img_fmt0->subsample_w != img_fmt1->subsample_w) ||
+        (img_fmt0->transfer_characteristics != img_fmt1->transfer_characteristics) ||
+        (img_fmt0->width != img_fmt1->width));
+}
+
+/* returns 0 if graph builder parameters are the same and 1 otherwise */
+static int compare_zimg_graph_builder_params(zimg_graph_builder_params *parm0, zimg_graph_builder_params *parm1)
+{
+    /* the parameters that could be changed inside a single ffmpeg zscale invocation  are checked only
+    and NaN values that are default for some params are treated properly*/
+    int ret = (parm0->allow_approximate_gamma != parm1->allow_approximate_gamma) ||
+        (parm0->dither_type != parm1->dither_type) ||
+        (parm0->resample_filter != parm1->resample_filter) ||
+        (parm0->resample_filter_uv != parm1->resample_filter_uv);
+
+    if ((isnan(parm0->nominal_peak_luminance) == 0) || (isnan(parm1->nominal_peak_luminance) == 0))
+        ret = ret || (parm0->nominal_peak_luminance != parm1->nominal_peak_luminance);
+    if ((isnan(parm0->filter_param_a) == 0) || (isnan(parm1->filter_param_a) == 0))
+        ret = ret || (parm0->filter_param_a != parm1->filter_param_a);
+    if ((isnan(parm0->filter_param_a_uv) == 0) || (isnan(parm1->filter_param_a_uv) == 0))
+        ret = ret || (parm0->filter_param_a_uv != parm1->filter_param_a_uv);
+    if ((isnan(parm0->filter_param_b) == 0) || (isnan(parm1->filter_param_b) == 0))
+        ret = ret || (parm0->filter_param_b != parm1->filter_param_b);
+    if ((isnan(parm0->filter_param_b_uv) == 0) || (isnan(parm1->filter_param_b_uv) == 0))
+        ret = ret || (parm0->filter_param_b_uv != parm1->filter_param_b_uv);
+
+    return ret;
+}
+
 static void format_init(zimg_image_format *format, AVFrame *frame, const AVPixFmtDescriptor *desc,
                         int colorspace, int primaries, int transfer, int range, int location)
 {
@@ -483,46 +560,94 @@  static void format_init(zimg_image_format *format, AVFrame *frame, const AVPixFm
     format->color_family = (desc->flags & AV_PIX_FMT_FLAG_RGB) ? ZIMG_COLOR_RGB : ZIMG_COLOR_YUV;
     format->matrix_coefficients = (desc->flags & AV_PIX_FMT_FLAG_RGB) ? ZIMG_MATRIX_RGB : colorspace == -1 ? convert_matrix(frame->colorspace) : colorspace;
     format->color_primaries = primaries == -1 ? convert_primaries(frame->color_primaries) : primaries;
-    format->transfer_characteristics = transfer == - 1 ? convert_trc(frame->color_trc) : transfer;
+    format->transfer_characteristics = transfer == -1 ? convert_trc(frame->color_trc) : transfer;
     format->pixel_range = (desc->flags & AV_PIX_FMT_FLAG_RGB) ? ZIMG_RANGE_FULL : range == -1 ? convert_range(frame->color_range) : range;
     format->chroma_location = location == -1 ? convert_chroma_location(frame->chroma_location) : location;
 }
 
-static int graph_build(zimg_filter_graph **graph, zimg_graph_builder_params *params,
-                       zimg_image_format *src_format, zimg_image_format *dst_format,
-                       void **tmp, size_t *tmp_size)
+static int graphs_build(AVFrame *in, AVFrame *out, const AVPixFmtDescriptor *desc, const AVPixFmtDescriptor *out_desc,
+                        ZScaleContext *s, int job_nr)
 {
     int ret;
     size_t size;
+    zimg_image_format src_format;
+    zimg_image_format dst_format;
+    zimg_image_format alpha_src_format;
+    zimg_image_format alpha_dst_format;
+
+    src_format = s->src_format;
+    dst_format = s->dst_format;
+    /* The input slice is specified through the active_region field,
+    unlike the output slice.
+    according to zimg requirements input and output slices should have even dimensions */
+    src_format.active_region.width = in->width;
+    src_format.active_region.height = s->slice_h;
+    src_format.active_region.left = 0;
+    src_format.active_region.top = job_nr * src_format.active_region.height;
+    //dst now is the single tile only!!
+    dst_format.width = out->width;
+    dst_format.height = ((unsigned int)(out->height / s->nb_threads)) & 0xfffffffe;
+
+    //the last slice could differ from the previous ones due to the slices division "tail"
+    if (job_nr == (s->nb_threads - 1)) {
+        src_format.active_region.height = src_format.height - src_format.active_region.top;
+        dst_format.height = out->height - job_nr * dst_format.height;
+    }
 
-    zimg_filter_graph_free(*graph);
-    *graph = zimg_filter_graph_build(src_format, dst_format, params);
-    if (!*graph)
+    if (s->graph[job_nr]) {
+        zimg_filter_graph_free(s->graph[job_nr]);
+    }
+    s->graph[job_nr] = zimg_filter_graph_build(&src_format, &dst_format, &s->params);
+    if (!s->graph[job_nr])
         return print_zimg_error(NULL);
 
-    ret = zimg_filter_graph_get_tmp_size(*graph, &size);
+    ret = zimg_filter_graph_get_tmp_size(s->graph[job_nr], &size);
     if (ret)
         return print_zimg_error(NULL);
 
-    if (size > *tmp_size) {
-        av_freep(tmp);
-        *tmp = av_malloc(size);
-        if (!*tmp)
-            return AVERROR(ENOMEM);
-
-        *tmp_size = size;
-    }
+    if (s->tmp[job_nr])
+        av_freep(&s->tmp[job_nr]);
+    s->tmp[job_nr] = av_malloc(size);
+    if (!s->tmp[job_nr])
+        return AVERROR(ENOMEM);
+
+    if (desc->flags & AV_PIX_FMT_FLAG_ALPHA && out_desc->flags & AV_PIX_FMT_FLAG_ALPHA) {
+        alpha_src_format = s->alpha_src_format;
+        alpha_dst_format = s->alpha_dst_format;
+        /* The input slice is specified through the active_region field, unlike the output slice.
+        according to zimg requirements input and output slices should have even dimensions */
+        alpha_src_format.active_region.width = in->width;
+        alpha_src_format.active_region.height = s->slice_h;
+        alpha_src_format.active_region.left = 0;
+        alpha_src_format.active_region.top = job_nr * alpha_src_format.active_region.height;
+        //dst now is the single tile only!!
+        alpha_dst_format.width = out->width;
+        alpha_dst_format.height = ((unsigned int)(out->height / s->nb_threads)) & 0xfffffffe;
+
+        //the last slice could differ from the previous ones due to the slices division "tail"
+        if (job_nr == (s->nb_threads - 1)) {
+            alpha_src_format.active_region.height = alpha_src_format.height - alpha_src_format.active_region.top;
+            alpha_dst_format.height = out->height - job_nr * alpha_dst_format.height;
+        }
 
+        if (s->alpha_graph[job_nr]) {
+            zimg_filter_graph_free(s->alpha_graph[job_nr]);
+        }
+        s->alpha_graph[job_nr] = zimg_filter_graph_build(&alpha_src_format, &alpha_dst_format, &s->alpha_params);
+        if (!s->alpha_graph[job_nr])
+            return print_zimg_error(NULL);
+     }
     return 0;
 }
 
 static int realign_frame(const AVPixFmtDescriptor *desc, AVFrame **frame)
 {
     AVFrame *aligned = NULL;
-    int ret = 0, plane;
+    int ret = 0, plane, planes;
 
     /* Realign any unaligned input frame. */
-    for (plane = 0; plane < 3; plane++) {
+    planes = av_pix_fmt_count_planes(desc->nb_components);
+    for (plane = 0; plane < planes; plane++) {
         int p = desc->comp[plane].plane;
         if ((uintptr_t)(*frame)->data[p] % ZIMG_ALIGNMENT || (*frame)->linesize[p] % ZIMG_ALIGNMENT) {
             if (!(aligned = av_frame_alloc())) {
@@ -572,20 +697,77 @@  static void update_output_color_information(ZScaleContext *s, AVFrame *frame)
         frame->chroma_location = (int)s->dst_format.chroma_location + 1;
 }
 
+static int filter_slice(AVFilterContext *ctx, void *data, int job_nr, int n_jobs)
+{
+    ThreadData *td = data;
+    int ret = 0;
+    int p;
+    int out_sampl;
+    int need_gb;
+    ZScaleContext *s = ctx->priv;
+    zimg_image_buffer_const src_buf = { ZIMG_API_VERSION };
+    zimg_image_buffer dst_buf = { ZIMG_API_VERSION };
+    int  dst_tile_height = ((unsigned int)(td->out->height / n_jobs)) & 0xfffffffe;
+
+    /* create zimg filter graphs for each thread
+     only if not created earlier or there is some change in frame parameters */
+    need_gb = compare_zimg_image_formats(&s->src_format, &s->src_format_tmp) ||
+        compare_zimg_image_formats(&s->dst_format, &s->dst_format_tmp) ||
+        compare_zimg_graph_builder_params(&s->params, &s->params_tmp);
+    if(td->desc->flags & AV_PIX_FMT_FLAG_ALPHA && td->odesc->flags & AV_PIX_FMT_FLAG_ALPHA)
+        need_gb = need_gb || compare_zimg_image_formats(&s->alpha_src_format, &s->alpha_src_format_tmp) ||
+            compare_zimg_image_formats(&s->alpha_dst_format, &s->alpha_dst_format_tmp) ||
+            compare_zimg_graph_builder_params(&s->alpha_params, &s->alpha_params_tmp);
+
+    if (need_gb){
+        ret = graphs_build(td->in, td->out, td->desc, td->odesc, s, job_nr);
+        if (ret < 0)
+            return print_zimg_error(ctx);
+    }
+    out_sampl = FFMAX3(td->out->linesize[0], td->out->linesize[1], td->out->linesize[2]);
+    for (int i = 0; i < 3; i++) {
+        p = td->desc->comp[i].plane;
+
+        src_buf.plane[i].data = td->in->data[p];
+        src_buf.plane[i].stride = td->in->linesize[p];
+        src_buf.plane[i].mask = -1;
+
+        p = td->odesc->comp[i].plane;
+        dst_buf.plane[i].data = td->out->data[p] + td->out->linesize[p] * dst_tile_height * td->out->linesize[p] / out_sampl * job_nr;
+        dst_buf.plane[i].stride = td->out->linesize[p];
+        dst_buf.plane[i].mask = -1;
+    }
+    ret = zimg_filter_graph_process(s->graph[job_nr], &src_buf, &dst_buf, s->tmp[job_nr], 0, 0, 0, 0);
+    if (ret)
+        return  print_zimg_error(ctx);
+
+    if (td->desc->flags & AV_PIX_FMT_FLAG_ALPHA && td->odesc->flags & AV_PIX_FMT_FLAG_ALPHA) {
+        src_buf.plane[0].data = td->in->data[3];
+        src_buf.plane[0].stride = td->in->linesize[3];
+        src_buf.plane[0].mask = -1;
+
+        dst_buf.plane[0].data = td->out->data[3] + td->out->linesize[3] * dst_tile_height  * job_nr;
+        dst_buf.plane[0].stride = td->out->linesize[3];
+        dst_buf.plane[0].mask = -1;
+
+        ret = zimg_filter_graph_process(s->alpha_graph[job_nr], &src_buf, &dst_buf, s->tmp[job_nr], 0, 0, 0, 0);
+        if (ret)
+            return print_zimg_error(ctx);
+    }
+    return 0;
+}
+
 static int filter_frame(AVFilterLink *link, AVFrame *in)
 {
-    ZScaleContext *s = link->dst->priv;
-    AVFilterLink *outlink = link->dst->outputs[0];
+    AVFilterContext *ctx = link->dst;
+    ZScaleContext *s = ctx->priv;
+    AVFilterLink *outlink = ctx->outputs[0];
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(link->format);
     const AVPixFmtDescriptor *odesc = av_pix_fmt_desc_get(outlink->format);
-    zimg_image_buffer_const src_buf = { ZIMG_API_VERSION };
-    zimg_image_buffer dst_buf = { ZIMG_API_VERSION };
     char buf[32];
-    int ret = 0, plane;
+    int ret = 0;
     AVFrame *out = NULL;
-
-    if ((ret = realign_frame(desc, &in)) < 0)
-        goto fail;
+    ThreadData td;
 
     if (!(out = ff_get_video_buffer(outlink, outlink->w, outlink->h))) {
         ret =  AVERROR(ENOMEM);
@@ -596,35 +778,53 @@  static int filter_frame(AVFilterLink *link, AVFrame *in)
     out->width  = outlink->w;
     out->height = outlink->h;
 
-    if(   in->width  != link->w
-       || in->height != link->h
-       || in->format != link->format
-       || s->in_colorspace != in->colorspace
-       || s->in_trc  != in->color_trc
-       || s->in_primaries != in->color_primaries
-       || s->in_range != in->color_range
-       || s->out_colorspace != out->colorspace
-       || s->out_trc  != out->color_trc
-       || s->out_primaries != out->color_primaries
-       || s->out_range != out->color_range
-       || s->in_chromal != in->chroma_location
-       || s->out_chromal != out->chroma_location) {
+    // run the zimg pipeline only when the input and output actually differ;
+    // otherwise just copy the input frame to the output
+    if (link->w != outlink->w ||
+        link->h != outlink->h ||
+        s->src_format.chroma_location != s->dst_format.chroma_location ||
+        s->src_format.color_family != s->dst_format.color_family ||
+        s->src_format.color_primaries != s->dst_format.color_primaries ||
+        s->src_format.depth != s->dst_format.depth ||
+        s->src_format.matrix_coefficients != s->dst_format.matrix_coefficients ||
+        s->src_format.field_parity != s->dst_format.field_parity ||
+        s->src_format.pixel_range != s->dst_format.pixel_range ||
+        s->src_format.pixel_type != s->dst_format.pixel_type ||
+        s->src_format.transfer_characteristics != s->dst_format.transfer_characteristics) {
+        if ((ret = realign_frame(desc, &in)) < 0)
+            goto fail;
+
         snprintf(buf, sizeof(buf)-1, "%d", outlink->w);
         av_opt_set(s, "w", buf, 0);
         snprintf(buf, sizeof(buf)-1, "%d", outlink->h);
         av_opt_set(s, "h", buf, 0);
 
+
         link->dst->inputs[0]->format = in->format;
         link->dst->inputs[0]->w      = in->width;
         link->dst->inputs[0]->h      = in->height;
 
-        if ((ret = config_props(outlink)) < 0)
-            goto fail;
+        // clamp to [1, MAX_THREADS] so the per-thread tmp[]/graph[] arrays cannot overflow
+        s->nb_threads = FFMAX(1, FFMIN3(ff_filter_get_nb_threads(ctx),
+                                        link->h / MIN_TILESIZE, MAX_THREADS));
+        s->slice_h = (link->h / s->nb_threads) & ~1; // slice_h must be even for zimg
+        s->in_colorspace = in->colorspace;
+        s->in_trc = in->color_trc;
+        s->in_primaries = in->color_primaries;
+        s->in_range = in->color_range;
+        s->out_colorspace = out->colorspace;
+        s->out_trc = out->color_trc;
+        s->out_primaries = out->color_primaries;
+        s->out_range = out->color_range;
 
         zimg_image_format_default(&s->src_format, ZIMG_API_VERSION);
         zimg_image_format_default(&s->dst_format, ZIMG_API_VERSION);
         zimg_graph_builder_params_default(&s->params, ZIMG_API_VERSION);
 
+        // translate the frame properties and the user-supplied overrides into zimg formats
+        format_init(&s->src_format, in, desc, s->colorspace_in,
+                    s->primaries_in, s->trc_in, s->range_in, s->chromal_in);
+        format_init(&s->dst_format, out, odesc, s->colorspace,
+                    s->primaries, s->trc, s->range, s->chromal);
+
         s->params.dither_type = s->dither;
         s->params.cpu_type = ZIMG_CPU_AUTO;
         s->params.resample_filter = s->filter;
@@ -634,27 +834,6 @@  static int filter_frame(AVFilterLink *link, AVFrame *in)
         s->params.filter_param_a = s->params.filter_param_a_uv = s->param_a;
         s->params.filter_param_b = s->params.filter_param_b_uv = s->param_b;
 
-        format_init(&s->src_format, in, desc, s->colorspace_in,
-                    s->primaries_in, s->trc_in, s->range_in, s->chromal_in);
-        format_init(&s->dst_format, out, odesc, s->colorspace,
-                    s->primaries, s->trc, s->range, s->chromal);
-
-        update_output_color_information(s, out);
-
-        ret = graph_build(&s->graph, &s->params, &s->src_format, &s->dst_format,
-                          &s->tmp, &s->tmp_size);
-        if (ret < 0)
-            goto fail;
-
-        s->in_colorspace  = in->colorspace;
-        s->in_trc         = in->color_trc;
-        s->in_primaries   = in->color_primaries;
-        s->in_range       = in->color_range;
-        s->out_colorspace = out->colorspace;
-        s->out_trc        = out->color_trc;
-        s->out_primaries  = out->color_primaries;
-        s->out_range      = out->color_range;
-
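+        // the alpha plane is converted through its own single-plane (grey) zimg graph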
         if (desc->flags & AV_PIX_FMT_FLAG_ALPHA && odesc->flags & AV_PIX_FMT_FLAG_ALPHA) {
             zimg_image_format_default(&s->alpha_src_format, ZIMG_API_VERSION);
             zimg_image_format_default(&s->alpha_dst_format, ZIMG_API_VERSION);
@@ -670,76 +849,54 @@  static int filter_frame(AVFilterLink *link, AVFrame *in)
             s->alpha_src_format.pixel_type = (desc->flags & AV_PIX_FMT_FLAG_FLOAT) ? ZIMG_PIXEL_FLOAT : desc->comp[0].depth > 8 ? ZIMG_PIXEL_WORD : ZIMG_PIXEL_BYTE;
             s->alpha_src_format.color_family = ZIMG_COLOR_GREY;
 
-            s->alpha_dst_format.width = out->width;
-            s->alpha_dst_format.height = out->height;
             s->alpha_dst_format.depth = odesc->comp[0].depth;
             s->alpha_dst_format.pixel_type = (odesc->flags & AV_PIX_FMT_FLAG_FLOAT) ? ZIMG_PIXEL_FLOAT : odesc->comp[0].depth > 8 ? ZIMG_PIXEL_WORD : ZIMG_PIXEL_BYTE;
             s->alpha_dst_format.color_family = ZIMG_COLOR_GREY;
-
-            zimg_filter_graph_free(s->alpha_graph);
-            s->alpha_graph = zimg_filter_graph_build(&s->alpha_src_format, &s->alpha_dst_format, &s->alpha_params);
-            if (!s->alpha_graph) {
-                ret = print_zimg_error(link->dst);
-                goto fail;
-            }
         }
-    }
-
-    update_output_color_information(s, out);
-
-    av_reduce(&out->sample_aspect_ratio.num, &out->sample_aspect_ratio.den,
-              (int64_t)in->sample_aspect_ratio.num * outlink->h * link->w,
-              (int64_t)in->sample_aspect_ratio.den * outlink->w * link->h,
-              INT_MAX);
-
-    for (plane = 0; plane < 3; plane++) {
-        int p = desc->comp[plane].plane;
-        src_buf.plane[plane].data   = in->data[p];
-        src_buf.plane[plane].stride = in->linesize[p];
-        src_buf.plane[plane].mask   = -1;
-
-        p = odesc->comp[plane].plane;
-        dst_buf.plane[plane].data   = out->data[p];
-        dst_buf.plane[plane].stride = out->linesize[p];
-        dst_buf.plane[plane].mask   = -1;
-    }
 
-    ret = zimg_filter_graph_process(s->graph, &src_buf, &dst_buf, s->tmp, 0, 0, 0, 0);
-    if (ret) {
-        ret = print_zimg_error(link->dst);
-        goto fail;
-    }
+        update_output_color_information(s, out);
+        av_reduce(&out->sample_aspect_ratio.num, &out->sample_aspect_ratio.den,
+                  (int64_t)in->sample_aspect_ratio.num * outlink->h * link->w,
+                  (int64_t)in->sample_aspect_ratio.den * outlink->w * link->h,
+                  INT_MAX);
 
-    if (desc->flags & AV_PIX_FMT_FLAG_ALPHA && odesc->flags & AV_PIX_FMT_FLAG_ALPHA) {
-        src_buf.plane[0].data   = in->data[3];
-        src_buf.plane[0].stride = in->linesize[3];
-        src_buf.plane[0].mask   = -1;
+        td.in = in;
+        td.out = out;
+        td.desc = desc;
+        td.odesc = odesc;
 
-        dst_buf.plane[0].data   = out->data[3];
-        dst_buf.plane[0].stride = out->linesize[3];
-        dst_buf.plane[0].mask   = -1;
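+        // run filter_slice() over s->nb_threads horizontal bands in parallel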
+        ff_filter_execute(ctx, filter_slice, &td, NULL, s->nb_threads);
 
-        ret = zimg_filter_graph_process(s->alpha_graph, &src_buf, &dst_buf, s->tmp, 0, 0, 0, 0);
-        if (ret) {
-            ret = print_zimg_error(link->dst);
-            goto fail;
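+        // keep a copy of the formats/params used for this build so the next frame can
+        // skip the expensive zimg_filter_graph_build() when nothing has changed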
+        s->src_format_tmp = s->src_format;
+        s->dst_format_tmp = s->dst_format;
+        s->params_tmp = s->params;
+        if (desc->flags & AV_PIX_FMT_FLAG_ALPHA && odesc->flags & AV_PIX_FMT_FLAG_ALPHA) {
+            s->alpha_src_format_tmp = s->alpha_src_format;
+            s->alpha_dst_format_tmp = s->alpha_dst_format;
+            s->alpha_params_tmp = s->alpha_params;
         }
-    } else if (odesc->flags & AV_PIX_FMT_FLAG_ALPHA) {
-        int x, y;
-
-        if (odesc->flags & AV_PIX_FMT_FLAG_FLOAT) {
-            for (y = 0; y < out->height; y++) {
-                for (x = 0; x < out->width; x++) {
-                    AV_WN32(out->data[3] + x * odesc->comp[3].step + y * out->linesize[3],
-                            av_float2int(1.0f));
+
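+        // the output has an alpha plane but the input does not: fill it with fully opaque values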
+        if (!(desc->flags & AV_PIX_FMT_FLAG_ALPHA) && odesc->flags & AV_PIX_FMT_FLAG_ALPHA) {
+            int x, y;
+            if (odesc->flags & AV_PIX_FMT_FLAG_FLOAT) {
+                for (y = 0; y < out->height; y++) {
+                    for (x = 0; x < out->width; x++) {
+                        AV_WN32(out->data[3] + x * odesc->comp[3].step + y * out->linesize[3],
+                                av_float2int(1.0f));
+                    }
                 }
+            } else {
+                for (y = 0; y < outlink->h; y++)
+                    memset(out->data[3] + y * out->linesize[3], 0xff, outlink->w);
             }
-        } else {
-            for (y = 0; y < outlink->h; y++)
-                memset(out->data[3] + y * out->linesize[3], 0xff, outlink->w);
         }
     }
-
+    else {
+        /* nothing to scale or convert - just copy the input frame to the output */
+        ret = av_frame_copy(out, in);
+        if (ret < 0)
+            goto fail;
+    }
 fail:
     av_frame_free(&in);
     if (ret) {
@@ -753,11 +910,12 @@  fail:
 static av_cold void uninit(AVFilterContext *ctx)
 {
     ZScaleContext *s = ctx->priv;
-
-    zimg_filter_graph_free(s->graph);
-    zimg_filter_graph_free(s->alpha_graph);
-    av_freep(&s->tmp);
-    s->tmp_size = 0;
+    int i;
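+    // free the per-thread temporary buffers and zimg graphs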
+    for (i = 0; i < s->nb_threads; i++) {
+        av_freep(&s->tmp[i]);
+        zimg_filter_graph_free(s->graph[i]);
+        zimg_filter_graph_free(s->alpha_graph[i]);
+    }
 }
 
 static int process_command(AVFilterContext *ctx, const char *cmd, const char *args,
@@ -941,4 +1099,5 @@  const AVFilter ff_vf_zscale = {
     FILTER_OUTPUTS(avfilter_vf_zscale_outputs),
     FILTER_QUERY_FUNC(query_formats),
     .process_command = process_command,
+    .flags           = AVFILTER_FLAG_SLICE_THREADS,
 };