From patchwork Thu May 9 07:42:50 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Ruiling Song X-Patchwork-Id: 13039 Return-Path: X-Original-To: patchwork@ffaux-bg.ffmpeg.org Delivered-To: patchwork@ffaux-bg.ffmpeg.org Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by ffaux.localdomain (Postfix) with ESMTP id 44EB74442BB for ; Thu, 9 May 2019 09:44:14 +0300 (EEST) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 23A0468A70E; Thu, 9 May 2019 09:44:14 +0300 (EEST) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from mga03.intel.com (mga03.intel.com [134.134.136.65]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 6064E68A59E for ; Thu, 9 May 2019 09:44:06 +0300 (EEST) X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga005.jf.intel.com ([10.7.209.41]) by orsmga103.jf.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 08 May 2019 23:44:04 -0700 X-ExtLoop1: 1 Received: from ruiling-nuc.sh.intel.com ([10.239.158.179]) by orsmga005.jf.intel.com with ESMTP; 08 May 2019 23:44:04 -0700 From: Ruiling Song To: ffmpeg-devel@ffmpeg.org Date: Thu, 9 May 2019 15:42:50 +0800 Message-Id: <20190509074250.19545-1-ruiling.song@intel.com> X-Mailer: git-send-email 2.17.1 Subject: [FFmpeg-devel] [PATCH] avfilter/vf_unsharp: enable slice threading X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: Ruiling Song MIME-Version: 1.0 Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" Signed-off-by: Ruiling Song --- libavfilter/unsharp.h | 4 +- libavfilter/vf_unsharp.c | 98 ++++++++++++++++++++++++++++++---------- 2 files changed, 78 insertions(+), 24 deletions(-) diff --git a/libavfilter/unsharp.h b/libavfilter/unsharp.h index caff986fc1..a60b30f31a 100644 --- a/libavfilter/unsharp.h +++ b/libavfilter/unsharp.h @@ -37,7 +37,8 @@ typedef struct UnsharpFilterParam { int steps_y; ///< vertical step count int scalebits; ///< bits to shift pixel int32_t halfscale; ///< amount to add to pixel - uint32_t *sc[MAX_MATRIX_SIZE - 1]; ///< finite state machine storage + uint32_t *sr; ///< finite state machine storage within a row + uint32_t **sc; ///< finite state machine storage across rows } UnsharpFilterParam; typedef struct UnsharpContext { @@ -47,6 +48,7 @@ typedef struct UnsharpContext { UnsharpFilterParam luma; ///< luma parameters (width, height, amount) UnsharpFilterParam chroma; ///< chroma parameters (width, height, amount) int hsub, vsub; + int nb_threads; int opencl; int (* apply_unsharp)(AVFilterContext *ctx, AVFrame *in, AVFrame *out); } UnsharpContext; diff --git a/libavfilter/vf_unsharp.c b/libavfilter/vf_unsharp.c index 41ccc56942..41c62d101a 100644 --- a/libavfilter/vf_unsharp.c +++ b/libavfilter/vf_unsharp.c @@ -47,15 +47,22 @@ #include "libavutil/pixdesc.h" #include "unsharp.h" -static void apply_unsharp( uint8_t *dst, int dst_stride, - const uint8_t *src, int src_stride, - int width, int height, UnsharpFilterParam *fp) +typedef struct TheadData { + UnsharpFilterParam *fp; + uint8_t *dst; + const uint8_t *src; + int dst_stride; + int src_stride; + int width; + int height; +} ThreadData; + +static int unsharp_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) { + ThreadData *td = arg; + UnsharpFilterParam *fp = td->fp; uint32_t **sc = fp->sc; - uint32_t sr[MAX_MATRIX_SIZE - 1], tmp1, tmp2; - - int32_t res; - int x, y, z; + uint32_t *sr = fp->sr; const uint8_t *src2 = NULL; //silence a warning const int amount = fp->amount; const int steps_x = fp->steps_x; @@ -63,30 +70,52 @@ static void apply_unsharp( uint8_t *dst, int dst_stride, const int scalebits = fp->scalebits; const int32_t halfscale = fp->halfscale; + uint8_t *dst = td->dst; + const uint8_t *src = td->src; + const int dst_stride = td->dst_stride; + const int src_stride = td->src_stride; + const int width = td->width; + const int height = td->height; + const int sc_offset = jobnr * 2 * steps_y; + const int sr_offset = jobnr * (MAX_MATRIX_SIZE - 1); + const int slice_start = (height * jobnr) / nb_jobs; + const int slice_end = (height * (jobnr+1)) / nb_jobs; + + int32_t res; + int x, y, z; + uint32_t tmp1, tmp2; + if (!amount) { av_image_copy_plane(dst, dst_stride, src, src_stride, width, height); - return; + return 0; } for (y = 0; y < 2 * steps_y; y++) - memset(sc[y], 0, sizeof(sc[y][0]) * (width + 2 * steps_x)); + memset(sc[sc_offset + y], 0, sizeof(sc[y][0]) * (width + 2 * steps_x)); - for (y = -steps_y; y < height + steps_y; y++) { + // if this is not the first tile, we start from (slice_start - steps_y), + // so we can get smooth result at slice boundary + if (slice_start > steps_y) { + src += (slice_start - steps_y) * src_stride; + dst += (slice_start - steps_y) * dst_stride; + } + + for (y = -steps_y + slice_start; y < steps_y + slice_end; y++) { if (y < height) src2 = src; - memset(sr, 0, sizeof(sr[0]) * (2 * steps_x - 1)); + memset(sr + sr_offset, 0, sizeof(sr[0]) * (2 * steps_x - 1)); for (x = -steps_x; x < width + steps_x; x++) { tmp1 = x <= 0 ? src2[0] : x >= width ? src2[width-1] : src2[x]; for (z = 0; z < steps_x * 2; z += 2) { - tmp2 = sr[z + 0] + tmp1; sr[z + 0] = tmp1; - tmp1 = sr[z + 1] + tmp2; sr[z + 1] = tmp2; + tmp2 = sr[sr_offset + z + 0] + tmp1; sr[sr_offset + z + 0] = tmp1; + tmp1 = sr[sr_offset + z + 1] + tmp2; sr[sr_offset + z + 1] = tmp2; } for (z = 0; z < steps_y * 2; z += 2) { - tmp2 = sc[z + 0][x + steps_x] + tmp1; sc[z + 0][x + steps_x] = tmp1; - tmp1 = sc[z + 1][x + steps_x] + tmp2; sc[z + 1][x + steps_x] = tmp2; + tmp2 = sc[sc_offset + z + 0][x + steps_x] + tmp1; sc[sc_offset + z + 0][x + steps_x] = tmp1; + tmp1 = sc[sc_offset + z + 1][x + steps_x] + tmp2; sc[sc_offset + z + 1][x + steps_x] = tmp2; } - if (x >= steps_x && y >= steps_y) { + if (x >= steps_x && y >= (steps_y + slice_start)) { const uint8_t *srx = src - steps_y * src_stride + x - steps_x; uint8_t *dsx = dst - steps_y * dst_stride + x - steps_x; @@ -99,6 +128,7 @@ static void apply_unsharp( uint8_t *dst, int dst_stride, src += src_stride; } } + return 0; } static int apply_unsharp_c(AVFilterContext *ctx, AVFrame *in, AVFrame *out) @@ -107,6 +137,8 @@ static int apply_unsharp_c(AVFilterContext *ctx, AVFrame *in, AVFrame *out) UnsharpContext *s = ctx->priv; int i, plane_w[3], plane_h[3]; UnsharpFilterParam *fp[3]; + ThreadData td; + plane_w[0] = inlink->w; plane_w[1] = plane_w[2] = AV_CEIL_RSHIFT(inlink->w, s->hsub); plane_h[0] = inlink->h; @@ -114,7 +146,14 @@ static int apply_unsharp_c(AVFilterContext *ctx, AVFrame *in, AVFrame *out) fp[0] = &s->luma; fp[1] = fp[2] = &s->chroma; for (i = 0; i < 3; i++) { - apply_unsharp(out->data[i], out->linesize[i], in->data[i], in->linesize[i], plane_w[i], plane_h[i], fp[i]); + td.fp = fp[i]; + td.dst = out->data[i]; + td.src = in->data[i]; + td.width = plane_w[i]; + td.height = plane_h[i]; + td.dst_stride = out->linesize[i]; + td.src_stride = in->linesize[i]; + ctx->internal->execute(ctx, unsharp_slice, &td, NULL, FFMIN(plane_h[i], s->nb_threads)); } return 0; } @@ -163,6 +202,7 @@ static int query_formats(AVFilterContext *ctx) static int init_filter_param(AVFilterContext *ctx, UnsharpFilterParam *fp, const char *effect_type, int width) { int z; + UnsharpContext *s = ctx->priv; const char *effect = fp->amount == 0 ? "none" : fp->amount < 0 ? "blur" : "sharpen"; if (!(fp->msize_x & fp->msize_y & 1)) { @@ -175,7 +215,12 @@ static int init_filter_param(AVFilterContext *ctx, UnsharpFilterParam *fp, const av_log(ctx, AV_LOG_VERBOSE, "effect:%s type:%s msize_x:%d msize_y:%d amount:%0.2f\n", effect, effect_type, fp->msize_x, fp->msize_y, fp->amount / 65535.0); - for (z = 0; z < 2 * fp->steps_y; z++) + fp->sr = av_malloc_array((MAX_MATRIX_SIZE - 1) * s->nb_threads, sizeof(uint32_t)); + fp->sc = av_malloc_array(2 * fp->steps_y * s->nb_threads, sizeof(uint32_t **)); + if (!fp->sr || !fp->sc) + return AVERROR(ENOMEM); + + for (z = 0; z < 2 * fp->steps_y * s->nb_threads; z++) if (!(fp->sc[z] = av_malloc_array(width + 2 * fp->steps_x, sizeof(*(fp->sc[z]))))) return AVERROR(ENOMEM); @@ -192,6 +237,11 @@ static int config_props(AVFilterLink *link) s->hsub = desc->log2_chroma_w; s->vsub = desc->log2_chroma_h; + // ensure (height / nb_threads) > 4 * steps_y, + // so that we don't have too much overlap between two threads + s->nb_threads = FFMIN(ff_filter_get_nb_threads(link->dst), + link->h / (4 * s->luma.steps_y)); + ret = init_filter_param(link->dst, &s->luma, "luma", link->w); if (ret < 0) return ret; @@ -202,20 +252,22 @@ static int config_props(AVFilterLink *link) return 0; } -static void free_filter_param(UnsharpFilterParam *fp) +static void free_filter_param(UnsharpFilterParam *fp, int nb_threads) { int z; - for (z = 0; z < 2 * fp->steps_y; z++) + for (z = 0; z < 2 * fp->steps_y * nb_threads; z++) av_freep(&fp->sc[z]); + av_freep(&fp->sc); + av_freep(&fp->sr); } static av_cold void uninit(AVFilterContext *ctx) { UnsharpContext *s = ctx->priv; - free_filter_param(&s->luma); - free_filter_param(&s->chroma); + free_filter_param(&s->luma, s->nb_threads); + free_filter_param(&s->chroma, s->nb_threads); } static int filter_frame(AVFilterLink *link, AVFrame *in) @@ -294,5 +346,5 @@ AVFilter ff_vf_unsharp = { .query_formats = query_formats, .inputs = avfilter_vf_unsharp_inputs, .outputs = avfilter_vf_unsharp_outputs, - .flags = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC, + .flags = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC | AVFILTER_FLAG_SLICE_THREADS, };