[FFmpeg-devel,24/24] lavfi/vf_scale: implement slice threading

Message ID	20210531075515.19544-24-anton@khirnov.net
State	New
Headers	Delivered-To: ffmpegpatchwork2@gmail.com
	Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100;
	From: Anton Khirnov <anton@khirnov.net>
	To: ffmpeg-devel@ffmpeg.org
	Date: Mon, 31 May 2021 09:55:15 +0200
	Message-Id: <20210531075515.19544-24-anton@khirnov.net>
	In-Reply-To: <20210531075515.19544-1-anton@khirnov.net>
	References: <20210531075515.19544-1-anton@khirnov.net>
	MIME-Version: 1.0
	Subject: [FFmpeg-devel] [PATCH 24/24] lavfi/vf_scale: implement slice threading
	Precedence: list
	Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
	Content-Type: text/plain; charset="us-ascii"
	Content-Transfer-Encoding: 7bit
	Errors-To: ffmpeg-devel-bounces@ffmpeg.org
	Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>
Series	[FFmpeg-devel,01/24] sws: remove unnecessary braces
	[FFmpeg-devel,01/24] sws: remove unnecessary braces
	[FFmpeg-devel,02/24] sws: factor out updating the palette
	[FFmpeg-devel,03/24] sws: reindent after previous commit
	[FFmpeg-devel,04/24] sws: return an error code on invalid parameters to sws_scale()
	[FFmpeg-devel,05/24] sws: factor out gamma-correct scaling
	[FFmpeg-devel,06/24] sws: cosmetics
	[FFmpeg-devel,07/24] sws: factor out cascaded scaling
	[FFmpeg-devel,08/24] sws: cosmetics
	[FFmpeg-devel,09/24] sws: initialize {src, dst}Stride2 consistently with {src, dst}2
	[FFmpeg-devel,10/24] sws: group the parameters validity checks together
	[FFmpeg-devel,11/24] sws: do not reallocate scratch buffers for each slice
	[FFmpeg-devel,12/24] sws: separate the calls to scaled vs unscaled conversion
	[FFmpeg-devel,13/24] sws: reset sliceDir at the end of sws_scale()
	[FFmpeg-devel,14/24] sws: make checking for the start of a new frame more explicit
	[FFmpeg-devel,15/24] sws: merge handling frame start into a single block
	[FFmpeg-devel,16/24] sws: simplify setting sliceDir
	[FFmpeg-devel,17/24] sws: move the early return for zero-sized slices higher up
	[FFmpeg-devel,18/24] sws: move initializing dither_error higher up
	[FFmpeg-devel,19/24] sws: move updating the palette higher up
	[FFmpeg-devel,20/24] sws: add a function for scaling dst slices
	[FFmpeg-devel,21/24] lavfi/vf_scale: factorize freeing the sws contexts
	[FFmpeg-devel,22/24] lavfi/vf_scale: pass only the private context to scale_slice()
	[FFmpeg-devel,23/24] lavfi/vf_scale: forward errors from swscale
	[FFmpeg-devel,24/24] lavfi/vf_scale: implement slice threading

Context	Check	Description
andriy/x86_make	success	Make finished
andriy/x86_make_fate	success	Make fate finished
andriy/PPC64_make	success	Make finished
andriy/PPC64_make_fate	success	Make fate finished

diff --git a/libavfilter/vf_scale.c b/libavfilter/vf_scale.c index cdd7c4da0d..87317393bd 100644 --- a/libavfilter/vf_scale.c +++ b/libavfilter/vf_scale.c @@ -106,8 +106,16 @@ enum EvalMode { typedef struct ScaleContext { const AVClass *class; - struct SwsContext *sws; ///< software scaler context - struct SwsContext *isws[2]; ///< software scaler context for interlaced material + + /** + * Scaler contexts. + * [0] - progressive + * [1/2] - top/bottom fields + */ + struct SwsContext *(*scalers)[3]; + unsigned int nb_scalers; + int *scaler_res; + AVDictionary *opts; /** @@ -122,6 +130,7 @@ typedef struct ScaleContext { double param[2]; // sws params int hsub, vsub; ///< chroma subsampling + int ohsub, ovsub; ///< output chroma subsampling int slice_y; ///< top of current output slice int input_is_pal; ///< set to 1 if the input format is paletted int output_is_pal; ///< set to 1 if the output format is paletted @@ -153,6 +162,7 @@ typedef struct ScaleContext { int eval_mode; ///< expression evaluation mode + int passthrough; } ScaleContext; const AVFilter ff_vf_scale2ref; @@ -330,13 +340,11 @@ static av_cold int init_dict(AVFilterContext *ctx, AVDictionary **opts) static void scaler_free(ScaleContext *s) { - sws_freeContext(s->sws); - sws_freeContext(s->isws[0]); - sws_freeContext(s->isws[1]); + for (int i = 0; i < s->nb_scalers; i++) + for (int j = 0; j < 3; j++) + sws_freeContext(s->scalers[i][j]); - s->sws = NULL; - s->isws[0] = NULL; - s->isws[1] = NULL; + av_freep(&s->scalers); } static av_cold void uninit(AVFilterContext *ctx) @@ -346,6 +354,7 @@ static av_cold void uninit(AVFilterContext *ctx) av_expr_free(scale->h_pexpr); scale->w_pexpr = scale->h_pexpr = NULL; scaler_free(scale); + av_freep(&scale->scaler_res); av_dict_free(&scale->opts); } @@ -522,19 +531,28 @@ static int config_props(AVFilterLink *outlink) scaler_free(scale); - if (inlink0->w == outlink->w && - inlink0->h == outlink->h && - !scale->out_color_matrix && - scale->in_range == scale->out_range 
&& - inlink0->format == outlink->format) - ; - else { - struct SwsContext **swscs[3] = {&scale->sws, &scale->isws[0], &scale->isws[1]}; - int i; - - for (i = 0; i < 3; i++) { + scale->passthrough = inlink0->w == outlink->w && + inlink0->h == outlink->h && + !scale->out_color_matrix && + scale->in_range == scale->out_range && + inlink0->format == outlink->format; + + if (!scale->passthrough) { + int nb_scalers = ff_filter_get_nb_threads(ctx); + + scale->scalers = av_mallocz_array(nb_scalers, 3 * sizeof(struct SwsContext*)); + if (!scale->scalers) + return AVERROR(ENOMEM); + + ret = av_reallocp_array(&scale->scaler_res, nb_scalers, sizeof(*scale->scaler_res)); + if (ret < 0) + return ret; + + for (int i = 0; i < 3; i++) { + for (int t = 0; t < nb_scalers; t++) { int in_v_chr_pos = scale->in_v_chr_pos, out_v_chr_pos = scale->out_v_chr_pos; - struct SwsContext **s = swscs[i]; + struct SwsContext **s = &scale->scalers[t][i]; + *s = sws_alloc_context(); if (!*s) return AVERROR(ENOMEM); @@ -580,9 +598,29 @@ static int config_props(AVFilterLink *outlink) if ((ret = sws_init_context(*s, NULL, NULL)) < 0) return ret; + + /* do not multithread error-diffusion dithering */ + if (i == 0 && t == 0) { + const AVOption *opt; + int64_t dither; + + av_opt_get_int(*s, "sws_dither", 0, &dither); + opt = av_opt_find2(*s, "ed", "sws_dither", 0, 0, NULL); + if (!opt) + return AVERROR_BUG; + + if (dither == opt->default_val.i64) { + av_log(ctx, AV_LOG_WARNING, "Error-diffusion dithering is " + "used, conversion will be single-threaded.\n"); + nb_scalers = 1; + } + } + } + if (!scale->interlaced) break; } + scale->nb_scalers = nb_scalers; } if (inlink0->sample_aspect_ratio.num){ @@ -625,7 +663,8 @@ static int request_frame_ref(AVFilterLink *outlink) return ff_request_frame(outlink->src->inputs[1]); } -static int scale_slice(ScaleContext *scale, AVFrame *out_buf, AVFrame *cur_pic, struct SwsContext *sws, int y, int h, int mul, int field) +static int scale_slice(ScaleContext *scale, AVFrame 
*out_buf, AVFrame *cur_pic, struct SwsContext *sws, + int y, int h, int mul, int field, int dst) { const uint8_t *in[4]; uint8_t *out[4]; @@ -633,9 +672,10 @@ static int scale_slice(ScaleContext *scale, AVFrame *out_buf, AVFrame *cur_pic, int i; for (i=0; i<4; i++) { - int vsub= ((i+1)&2) ? scale->vsub : 0; - ptrdiff_t in_offset = ((y>>vsub)+field) * cur_pic->linesize[i]; - ptrdiff_t out_offset = field * out_buf->linesize[i]; + int vsub = ((i+1)&2) ? scale->vsub : 0; + int ovsub = ((i+1)&2) ? scale->ovsub : 0; + ptrdiff_t in_offset = (((y * !dst) >> vsub) + field) * cur_pic->linesize[i]; + ptrdiff_t out_offset = (((y * dst) >> ovsub) + field) * out_buf->linesize[i]; in_stride[i] = cur_pic->linesize[i] * mul; out_stride[i] = out_buf->linesize[i] * mul; in[i] = FF_PTR_ADD(cur_pic->data[i], in_offset); @@ -646,17 +686,57 @@ static int scale_slice(ScaleContext *scale, AVFrame *out_buf, AVFrame *cur_pic, if (scale->output_is_pal) out[1] = out_buf->data[1]; + if (dst) + return sws_scale_dst_slice(sws, in, in_stride, + out, out_stride, y / mul, h); + return sws_scale(sws, in, in_stride, y/mul, h, out,out_stride); } +typedef struct ScaleThreadData { + AVFrame *frame_in; + AVFrame *frame_out; + int scaler_idx; +} ScaleThreadData; + +static int scaler_res(ScaleContext *scale) +{ + for (int i = 0; i < scale->nb_scalers; i++) + if (scale->scaler_res[i] < 0) + return scale->scaler_res[i]; + return 0; +} + +static int scale_job(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) +{ + ScaleContext *scale = ctx->priv; + ScaleThreadData *td = arg; + int stride_mul = 1 << (td->scaler_idx > 0); + int first_field = td->scaler_idx == 1; + int picture_height = (td->frame_out->height + first_field) / stride_mul; + int slice_height = FFALIGN(FFMAX((picture_height + nb_jobs - 1) / nb_jobs, 1), + 1 << scale->ovsub); + int slice_start = jobnr * slice_height; + int slice_end = FFMIN((jobnr + 1) * slice_height, picture_height); + + if (slice_start < slice_end) { + scale_slice(scale, 
td->frame_out, td->frame_in, + scale->scalers[jobnr][td->scaler_idx], slice_start, + slice_end - slice_start, stride_mul, td->scaler_idx == 2, 1); + } + + return 0; +} + static int scale_frame(AVFilterLink *link, AVFrame *in, AVFrame **frame_out) { AVFilterContext *ctx = link->dst; ScaleContext *scale = ctx->priv; AVFilterLink *outlink = ctx->outputs[0]; AVFrame *out; - const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(link->format); + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(link->format); + const AVPixFmtDescriptor *odesc = av_pix_fmt_desc_get(outlink->format); char buf[32]; int ret; int in_range; @@ -723,13 +803,15 @@ static int scale_frame(AVFilterLink *link, AVFrame *in, AVFrame **frame_out) } scale: - if (!scale->sws) { + if (!scale->nb_scalers) { *frame_out = in; return 0; } scale->hsub = desc->log2_chroma_w; scale->vsub = desc->log2_chroma_h; + scale->ohsub = odesc->log2_chroma_w; + scale->ovsub = odesc->log2_chroma_h; out = ff_get_video_buffer(outlink, outlink->w, outlink->h); if (!out) { @@ -755,7 +837,7 @@ scale: int in_full, out_full, brightness, contrast, saturation; const int *inv_table, *table; - sws_getColorspaceDetails(scale->sws, (int **)&inv_table, &in_full, + sws_getColorspaceDetails(scale->scalers[0][0], (int **)&inv_table, &in_full, (int **)&table, &out_full, &brightness, &contrast, &saturation); @@ -773,17 +855,14 @@ scale: if (scale->out_range != AVCOL_RANGE_UNSPECIFIED) out_full = (scale->out_range == AVCOL_RANGE_JPEG); - sws_setColorspaceDetails(scale->sws, inv_table, in_full, + for (int i = 0; i < 3; i++) + for (int j = 0; j < scale->nb_scalers; j++) { + if (!scale->scalers[j][i]) + continue; + sws_setColorspaceDetails(scale->scalers[j][i], inv_table, in_full, table, out_full, brightness, contrast, saturation); - if (scale->isws[0]) - sws_setColorspaceDetails(scale->isws[0], inv_table, in_full, - table, out_full, - brightness, contrast, saturation); - if (scale->isws[1]) - sws_setColorspaceDetails(scale->isws[1], inv_table, 
in_full, - table, out_full, - brightness, contrast, saturation); + } out->color_range = out_full ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG; } @@ -793,10 +872,22 @@ scale: (int64_t)in->sample_aspect_ratio.den * outlink->w * link->h, INT_MAX); + memset(scale->scaler_res, 0, scale->nb_scalers * sizeof(*scale->scaler_res)); + if (scale->interlaced>0 || (scale->interlaced<0 && in->interlaced_frame)) { - ret = scale_slice(scale, out, in, scale->isws[0], 0, (link->h+1)/2, 2, 0); - if (ret >= 0) - ret = scale_slice(scale, out, in, scale->isws[1], 0, link->h /2, 2, 1); + ScaleThreadData td = { + .scaler_idx = 1, + .frame_in = in, + .frame_out = out, + }; + + ctx->internal->execute(ctx, scale_job, &td, scale->scaler_res, scale->nb_scalers); + + if (scaler_res(scale) >= 0) { + td.scaler_idx = 2; + memset(scale->scaler_res, 0, scale->nb_scalers * sizeof(*scale->scaler_res)); + ctx->internal->execute(ctx, scale_job, &td, scale->scaler_res, scale->nb_scalers); + } } else if (scale->nb_slices) { int i, slice_h, slice_start, slice_end = 0; const int nb_slices = FFMIN(scale->nb_slices, link->h); @@ -804,14 +895,22 @@ scale: slice_start = slice_end; slice_end = (link->h * (i+1)) / nb_slices; slice_h = slice_end - slice_start; - ret = scale_slice(scale, out, in, scale->sws, slice_start, slice_h, 1, 0); + ret = scale_slice(scale, out, in, scale->scalers[0][0], slice_start, slice_h, 1, 0, 0); if (ret < 0) break; } } else { - ret = scale_slice(scale, out, in, scale->sws, 0, link->h, 1, 0); + ScaleThreadData td = { + .scaler_idx = 0, + .frame_in = in, + .frame_out = out, + }; + + ctx->internal->execute(ctx, scale_job, &td, scale->scaler_res, scale->nb_scalers); } + ret = scaler_res(scale); + av_frame_free(&in); if (ret < 0) av_frame_free(frame_out); @@ -984,6 +1083,7 @@ const AVFilter ff_vf_scale = { .inputs = avfilter_vf_scale_inputs, .outputs = avfilter_vf_scale_outputs, .process_command = process_command, + .flags = AVFILTER_FLAG_SLICE_THREADS, }; static const AVClass scale2ref_class = {

[FFmpeg-devel,24/24] lavfi/vf_scale: implement slice threading

Checks

Commit Message

Comments

Patch