Message ID | 20190516094824.13141-1-ruiling.song@intel.com |
---|---|
State | Accepted |
Commit | 94ceeba9f991ab69b192fa8527be0965de7e254b |
Headers | show |
> -----Original Message----- > From: Song, Ruiling > Sent: Thursday, May 16, 2019 5:48 PM > To: ffmpeg-devel@ffmpeg.org > Cc: Song, Ruiling <ruiling.song@intel.com> > Subject: [PATCH V2] avfilter/vf_unsharp: enable slice threading > > benchmarking with a simple command: > ffmpeg -i 1080p.mp4 -vf unsharp=la=3:ca=3 -an -f null /dev/null > with the patch, the fps increase from 50 to 120 on my local machine (i7- > 6770HQ). > > v2: > make av_image_copy_plane() only copy per-slice content. > > Signed-off-by: Ruiling Song <ruiling.song@intel.com> Ping? Any comments? > --- > libavfilter/unsharp.h | 4 +- > libavfilter/vf_unsharp.c | 102 ++++++++++++++++++++++++++++++--------- > 2 files changed, 81 insertions(+), 25 deletions(-) > > diff --git a/libavfilter/unsharp.h b/libavfilter/unsharp.h > index caff986fc1..a60b30f31a 100644 > --- a/libavfilter/unsharp.h > +++ b/libavfilter/unsharp.h > @@ -37,7 +37,8 @@ typedef struct UnsharpFilterParam { > int steps_y; ///< vertical step count > int scalebits; ///< bits to shift pixel > int32_t halfscale; ///< amount to add to pixel > - uint32_t *sc[MAX_MATRIX_SIZE - 1]; ///< finite state machine storage > + uint32_t *sr; ///< finite state machine storage within a row > + uint32_t **sc; ///< finite state machine storage across rows > } UnsharpFilterParam; > > typedef struct UnsharpContext { > @@ -47,6 +48,7 @@ typedef struct UnsharpContext { > UnsharpFilterParam luma; ///< luma parameters (width, height, amount) > UnsharpFilterParam chroma; ///< chroma parameters (width, height, > amount) > int hsub, vsub; > + int nb_threads; > int opencl; > int (* apply_unsharp)(AVFilterContext *ctx, AVFrame *in, AVFrame *out); > } UnsharpContext; > diff --git a/libavfilter/vf_unsharp.c b/libavfilter/vf_unsharp.c > index 41ccc56942..af05833a5d 100644 > --- a/libavfilter/vf_unsharp.c > +++ b/libavfilter/vf_unsharp.c > @@ -47,15 +47,22 @@ > #include "libavutil/pixdesc.h" > #include "unsharp.h" > > -static void apply_unsharp( uint8_t *dst, int dst_stride, 
> - const uint8_t *src, int src_stride, > - int width, int height, UnsharpFilterParam *fp) > +typedef struct TheadData { > + UnsharpFilterParam *fp; > + uint8_t *dst; > + const uint8_t *src; > + int dst_stride; > + int src_stride; > + int width; > + int height; > +} ThreadData; > + > +static int unsharp_slice(AVFilterContext *ctx, void *arg, int jobnr, int > nb_jobs) > { > + ThreadData *td = arg; > + UnsharpFilterParam *fp = td->fp; > uint32_t **sc = fp->sc; > - uint32_t sr[MAX_MATRIX_SIZE - 1], tmp1, tmp2; > - > - int32_t res; > - int x, y, z; > + uint32_t *sr = fp->sr; > const uint8_t *src2 = NULL; //silence a warning > const int amount = fp->amount; > const int steps_x = fp->steps_x; > @@ -63,30 +70,54 @@ static void apply_unsharp( uint8_t *dst, int > dst_stride, > const int scalebits = fp->scalebits; > const int32_t halfscale = fp->halfscale; > > + uint8_t *dst = td->dst; > + const uint8_t *src = td->src; > + const int dst_stride = td->dst_stride; > + const int src_stride = td->src_stride; > + const int width = td->width; > + const int height = td->height; > + const int sc_offset = jobnr * 2 * steps_y; > + const int sr_offset = jobnr * (MAX_MATRIX_SIZE - 1); > + const int slice_start = (height * jobnr) / nb_jobs; > + const int slice_end = (height * (jobnr+1)) / nb_jobs; > + > + int32_t res; > + int x, y, z; > + uint32_t tmp1, tmp2; > + > if (!amount) { > - av_image_copy_plane(dst, dst_stride, src, src_stride, width, height); > - return; > + av_image_copy_plane(dst + slice_start * dst_stride, dst_stride, > + src + slice_start * src_stride, src_stride, > + width, slice_end - slice_start); > + return 0; > } > > for (y = 0; y < 2 * steps_y; y++) > - memset(sc[y], 0, sizeof(sc[y][0]) * (width + 2 * steps_x)); > + memset(sc[sc_offset + y], 0, sizeof(sc[y][0]) * (width + 2 * steps_x)); > > - for (y = -steps_y; y < height + steps_y; y++) { > + // if this is not the first tile, we start from (slice_start - steps_y), > + // so we can get smooth result at slice boundary > 
+ if (slice_start > steps_y) { > + src += (slice_start - steps_y) * src_stride; > + dst += (slice_start - steps_y) * dst_stride; > + } > + > + for (y = -steps_y + slice_start; y < steps_y + slice_end; y++) { > if (y < height) > src2 = src; > > - memset(sr, 0, sizeof(sr[0]) * (2 * steps_x - 1)); > + memset(sr + sr_offset, 0, sizeof(sr[0]) * (2 * steps_x - 1)); > for (x = -steps_x; x < width + steps_x; x++) { > tmp1 = x <= 0 ? src2[0] : x >= width ? src2[width-1] : src2[x]; > for (z = 0; z < steps_x * 2; z += 2) { > - tmp2 = sr[z + 0] + tmp1; sr[z + 0] = tmp1; > - tmp1 = sr[z + 1] + tmp2; sr[z + 1] = tmp2; > + tmp2 = sr[sr_offset + z + 0] + tmp1; sr[sr_offset + z + 0] = tmp1; > + tmp1 = sr[sr_offset + z + 1] + tmp2; sr[sr_offset + z + 1] = tmp2; > } > for (z = 0; z < steps_y * 2; z += 2) { > - tmp2 = sc[z + 0][x + steps_x] + tmp1; sc[z + 0][x + steps_x] = tmp1; > - tmp1 = sc[z + 1][x + steps_x] + tmp2; sc[z + 1][x + steps_x] = tmp2; > + tmp2 = sc[sc_offset + z + 0][x + steps_x] + tmp1; sc[sc_offset + z + > 0][x + steps_x] = tmp1; > + tmp1 = sc[sc_offset + z + 1][x + steps_x] + tmp2; sc[sc_offset + z + > 1][x + steps_x] = tmp2; > } > - if (x >= steps_x && y >= steps_y) { > + if (x >= steps_x && y >= (steps_y + slice_start)) { > const uint8_t *srx = src - steps_y * src_stride + x - steps_x; > uint8_t *dsx = dst - steps_y * dst_stride + x - steps_x; > > @@ -99,6 +130,7 @@ static void apply_unsharp( uint8_t *dst, int > dst_stride, > src += src_stride; > } > } > + return 0; > } > > static int apply_unsharp_c(AVFilterContext *ctx, AVFrame *in, AVFrame > *out) > @@ -107,6 +139,8 @@ static int apply_unsharp_c(AVFilterContext *ctx, > AVFrame *in, AVFrame *out) > UnsharpContext *s = ctx->priv; > int i, plane_w[3], plane_h[3]; > UnsharpFilterParam *fp[3]; > + ThreadData td; > + > plane_w[0] = inlink->w; > plane_w[1] = plane_w[2] = AV_CEIL_RSHIFT(inlink->w, s->hsub); > plane_h[0] = inlink->h; > @@ -114,7 +148,14 @@ static int apply_unsharp_c(AVFilterContext *ctx, > AVFrame *in, 
AVFrame *out) > fp[0] = &s->luma; > fp[1] = fp[2] = &s->chroma; > for (i = 0; i < 3; i++) { > - apply_unsharp(out->data[i], out->linesize[i], in->data[i], in->linesize[i], > plane_w[i], plane_h[i], fp[i]); > + td.fp = fp[i]; > + td.dst = out->data[i]; > + td.src = in->data[i]; > + td.width = plane_w[i]; > + td.height = plane_h[i]; > + td.dst_stride = out->linesize[i]; > + td.src_stride = in->linesize[i]; > + ctx->internal->execute(ctx, unsharp_slice, &td, NULL, > FFMIN(plane_h[i], s->nb_threads)); > } > return 0; > } > @@ -163,6 +204,7 @@ static int query_formats(AVFilterContext *ctx) > static int init_filter_param(AVFilterContext *ctx, UnsharpFilterParam *fp, > const char *effect_type, int width) > { > int z; > + UnsharpContext *s = ctx->priv; > const char *effect = fp->amount == 0 ? "none" : fp->amount < 0 ? "blur" : > "sharpen"; > > if (!(fp->msize_x & fp->msize_y & 1)) { > @@ -175,7 +217,12 @@ static int init_filter_param(AVFilterContext *ctx, > UnsharpFilterParam *fp, const > av_log(ctx, AV_LOG_VERBOSE, "effect:%s type:%s msize_x:%d > msize_y:%d amount:%0.2f\n", > effect, effect_type, fp->msize_x, fp->msize_y, fp->amount / 65535.0); > > - for (z = 0; z < 2 * fp->steps_y; z++) > + fp->sr = av_malloc_array((MAX_MATRIX_SIZE - 1) * s->nb_threads, > sizeof(uint32_t)); > + fp->sc = av_malloc_array(2 * fp->steps_y * s->nb_threads, > sizeof(uint32_t **)); > + if (!fp->sr || !fp->sc) > + return AVERROR(ENOMEM); > + > + for (z = 0; z < 2 * fp->steps_y * s->nb_threads; z++) > if (!(fp->sc[z] = av_malloc_array(width + 2 * fp->steps_x, > sizeof(*(fp->sc[z]))))) > return AVERROR(ENOMEM); > @@ -192,6 +239,11 @@ static int config_props(AVFilterLink *link) > s->hsub = desc->log2_chroma_w; > s->vsub = desc->log2_chroma_h; > > + // ensure (height / nb_threads) > 4 * steps_y, > + // so that we don't have too much overlap between two threads > + s->nb_threads = FFMIN(ff_filter_get_nb_threads(link->dst), > + link->h / (4 * s->luma.steps_y)); > + > ret = init_filter_param(link->dst, 
&s->luma, "luma", link->w); > if (ret < 0) > return ret; > @@ -202,20 +254,22 @@ static int config_props(AVFilterLink *link) > return 0; > } > > -static void free_filter_param(UnsharpFilterParam *fp) > +static void free_filter_param(UnsharpFilterParam *fp, int nb_threads) > { > int z; > > - for (z = 0; z < 2 * fp->steps_y; z++) > + for (z = 0; z < 2 * fp->steps_y * nb_threads; z++) > av_freep(&fp->sc[z]); > + av_freep(&fp->sc); > + av_freep(&fp->sr); > } > > static av_cold void uninit(AVFilterContext *ctx) > { > UnsharpContext *s = ctx->priv; > > - free_filter_param(&s->luma); > - free_filter_param(&s->chroma); > + free_filter_param(&s->luma, s->nb_threads); > + free_filter_param(&s->chroma, s->nb_threads); > } > > static int filter_frame(AVFilterLink *link, AVFrame *in) > @@ -294,5 +348,5 @@ AVFilter ff_vf_unsharp = { > .query_formats = query_formats, > .inputs = avfilter_vf_unsharp_inputs, > .outputs = avfilter_vf_unsharp_outputs, > - .flags = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC, > + .flags = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC | > AVFILTER_FLAG_SLICE_THREADS, > }; > -- > 2.17.1
> -----Original Message----- > From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf > Of Song, Ruiling > Sent: Thursday, May 23, 2019 9:26 PM > To: ffmpeg-devel@ffmpeg.org > Subject: Re: [FFmpeg-devel] [PATCH V2] avfilter/vf_unsharp: enable slice > threading > > > -----Original Message----- > > From: Song, Ruiling > > Sent: Thursday, May 16, 2019 5:48 PM > > To: ffmpeg-devel@ffmpeg.org > > Cc: Song, Ruiling <ruiling.song@intel.com> > > Subject: [PATCH V2] avfilter/vf_unsharp: enable slice threading > > > > benchmarking with a simple command: > > ffmpeg -i 1080p.mp4 -vf unsharp=la=3:ca=3 -an -f null /dev/null > > with the patch, the fps increase from 50 to 120 on my local machine (i7- > > 6770HQ). > > > > v2: > > make av_image_copy_plane() only copy per-slice content. > > > > Signed-off-by: Ruiling Song <ruiling.song@intel.com> > Ping? Any comments? Ping? Will apply next week if nobody against. Ruiling
diff --git a/libavfilter/unsharp.h b/libavfilter/unsharp.h index caff986fc1..a60b30f31a 100644 --- a/libavfilter/unsharp.h +++ b/libavfilter/unsharp.h @@ -37,7 +37,8 @@ typedef struct UnsharpFilterParam { int steps_y; ///< vertical step count int scalebits; ///< bits to shift pixel int32_t halfscale; ///< amount to add to pixel - uint32_t *sc[MAX_MATRIX_SIZE - 1]; ///< finite state machine storage + uint32_t *sr; ///< finite state machine storage within a row + uint32_t **sc; ///< finite state machine storage across rows } UnsharpFilterParam; typedef struct UnsharpContext { @@ -47,6 +48,7 @@ typedef struct UnsharpContext { UnsharpFilterParam luma; ///< luma parameters (width, height, amount) UnsharpFilterParam chroma; ///< chroma parameters (width, height, amount) int hsub, vsub; + int nb_threads; int opencl; int (* apply_unsharp)(AVFilterContext *ctx, AVFrame *in, AVFrame *out); } UnsharpContext; diff --git a/libavfilter/vf_unsharp.c b/libavfilter/vf_unsharp.c index 41ccc56942..af05833a5d 100644 --- a/libavfilter/vf_unsharp.c +++ b/libavfilter/vf_unsharp.c @@ -47,15 +47,22 @@ #include "libavutil/pixdesc.h" #include "unsharp.h" -static void apply_unsharp( uint8_t *dst, int dst_stride, - const uint8_t *src, int src_stride, - int width, int height, UnsharpFilterParam *fp) +typedef struct TheadData { + UnsharpFilterParam *fp; + uint8_t *dst; + const uint8_t *src; + int dst_stride; + int src_stride; + int width; + int height; +} ThreadData; + +static int unsharp_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) { + ThreadData *td = arg; + UnsharpFilterParam *fp = td->fp; uint32_t **sc = fp->sc; - uint32_t sr[MAX_MATRIX_SIZE - 1], tmp1, tmp2; - - int32_t res; - int x, y, z; + uint32_t *sr = fp->sr; const uint8_t *src2 = NULL; //silence a warning const int amount = fp->amount; const int steps_x = fp->steps_x; @@ -63,30 +70,54 @@ static void apply_unsharp( uint8_t *dst, int dst_stride, const int scalebits = fp->scalebits; const int32_t halfscale = 
fp->halfscale; + uint8_t *dst = td->dst; + const uint8_t *src = td->src; + const int dst_stride = td->dst_stride; + const int src_stride = td->src_stride; + const int width = td->width; + const int height = td->height; + const int sc_offset = jobnr * 2 * steps_y; + const int sr_offset = jobnr * (MAX_MATRIX_SIZE - 1); + const int slice_start = (height * jobnr) / nb_jobs; + const int slice_end = (height * (jobnr+1)) / nb_jobs; + + int32_t res; + int x, y, z; + uint32_t tmp1, tmp2; + if (!amount) { - av_image_copy_plane(dst, dst_stride, src, src_stride, width, height); - return; + av_image_copy_plane(dst + slice_start * dst_stride, dst_stride, + src + slice_start * src_stride, src_stride, + width, slice_end - slice_start); + return 0; } for (y = 0; y < 2 * steps_y; y++) - memset(sc[y], 0, sizeof(sc[y][0]) * (width + 2 * steps_x)); + memset(sc[sc_offset + y], 0, sizeof(sc[y][0]) * (width + 2 * steps_x)); - for (y = -steps_y; y < height + steps_y; y++) { + // if this is not the first tile, we start from (slice_start - steps_y), + // so we can get smooth result at slice boundary + if (slice_start > steps_y) { + src += (slice_start - steps_y) * src_stride; + dst += (slice_start - steps_y) * dst_stride; + } + + for (y = -steps_y + slice_start; y < steps_y + slice_end; y++) { if (y < height) src2 = src; - memset(sr, 0, sizeof(sr[0]) * (2 * steps_x - 1)); + memset(sr + sr_offset, 0, sizeof(sr[0]) * (2 * steps_x - 1)); for (x = -steps_x; x < width + steps_x; x++) { tmp1 = x <= 0 ? src2[0] : x >= width ? 
src2[width-1] : src2[x]; for (z = 0; z < steps_x * 2; z += 2) { - tmp2 = sr[z + 0] + tmp1; sr[z + 0] = tmp1; - tmp1 = sr[z + 1] + tmp2; sr[z + 1] = tmp2; + tmp2 = sr[sr_offset + z + 0] + tmp1; sr[sr_offset + z + 0] = tmp1; + tmp1 = sr[sr_offset + z + 1] + tmp2; sr[sr_offset + z + 1] = tmp2; } for (z = 0; z < steps_y * 2; z += 2) { - tmp2 = sc[z + 0][x + steps_x] + tmp1; sc[z + 0][x + steps_x] = tmp1; - tmp1 = sc[z + 1][x + steps_x] + tmp2; sc[z + 1][x + steps_x] = tmp2; + tmp2 = sc[sc_offset + z + 0][x + steps_x] + tmp1; sc[sc_offset + z + 0][x + steps_x] = tmp1; + tmp1 = sc[sc_offset + z + 1][x + steps_x] + tmp2; sc[sc_offset + z + 1][x + steps_x] = tmp2; } - if (x >= steps_x && y >= steps_y) { + if (x >= steps_x && y >= (steps_y + slice_start)) { const uint8_t *srx = src - steps_y * src_stride + x - steps_x; uint8_t *dsx = dst - steps_y * dst_stride + x - steps_x; @@ -99,6 +130,7 @@ static void apply_unsharp( uint8_t *dst, int dst_stride, src += src_stride; } } + return 0; } static int apply_unsharp_c(AVFilterContext *ctx, AVFrame *in, AVFrame *out) @@ -107,6 +139,8 @@ static int apply_unsharp_c(AVFilterContext *ctx, AVFrame *in, AVFrame *out) UnsharpContext *s = ctx->priv; int i, plane_w[3], plane_h[3]; UnsharpFilterParam *fp[3]; + ThreadData td; + plane_w[0] = inlink->w; plane_w[1] = plane_w[2] = AV_CEIL_RSHIFT(inlink->w, s->hsub); plane_h[0] = inlink->h; @@ -114,7 +148,14 @@ static int apply_unsharp_c(AVFilterContext *ctx, AVFrame *in, AVFrame *out) fp[0] = &s->luma; fp[1] = fp[2] = &s->chroma; for (i = 0; i < 3; i++) { - apply_unsharp(out->data[i], out->linesize[i], in->data[i], in->linesize[i], plane_w[i], plane_h[i], fp[i]); + td.fp = fp[i]; + td.dst = out->data[i]; + td.src = in->data[i]; + td.width = plane_w[i]; + td.height = plane_h[i]; + td.dst_stride = out->linesize[i]; + td.src_stride = in->linesize[i]; + ctx->internal->execute(ctx, unsharp_slice, &td, NULL, FFMIN(plane_h[i], s->nb_threads)); } return 0; } @@ -163,6 +204,7 @@ static int 
query_formats(AVFilterContext *ctx) static int init_filter_param(AVFilterContext *ctx, UnsharpFilterParam *fp, const char *effect_type, int width) { int z; + UnsharpContext *s = ctx->priv; const char *effect = fp->amount == 0 ? "none" : fp->amount < 0 ? "blur" : "sharpen"; if (!(fp->msize_x & fp->msize_y & 1)) { @@ -175,7 +217,12 @@ static int init_filter_param(AVFilterContext *ctx, UnsharpFilterParam *fp, const av_log(ctx, AV_LOG_VERBOSE, "effect:%s type:%s msize_x:%d msize_y:%d amount:%0.2f\n", effect, effect_type, fp->msize_x, fp->msize_y, fp->amount / 65535.0); - for (z = 0; z < 2 * fp->steps_y; z++) + fp->sr = av_malloc_array((MAX_MATRIX_SIZE - 1) * s->nb_threads, sizeof(uint32_t)); + fp->sc = av_malloc_array(2 * fp->steps_y * s->nb_threads, sizeof(uint32_t **)); + if (!fp->sr || !fp->sc) + return AVERROR(ENOMEM); + + for (z = 0; z < 2 * fp->steps_y * s->nb_threads; z++) if (!(fp->sc[z] = av_malloc_array(width + 2 * fp->steps_x, sizeof(*(fp->sc[z]))))) return AVERROR(ENOMEM); @@ -192,6 +239,11 @@ static int config_props(AVFilterLink *link) s->hsub = desc->log2_chroma_w; s->vsub = desc->log2_chroma_h; + // ensure (height / nb_threads) > 4 * steps_y, + // so that we don't have too much overlap between two threads + s->nb_threads = FFMIN(ff_filter_get_nb_threads(link->dst), + link->h / (4 * s->luma.steps_y)); + ret = init_filter_param(link->dst, &s->luma, "luma", link->w); if (ret < 0) return ret; @@ -202,20 +254,22 @@ static int config_props(AVFilterLink *link) return 0; } -static void free_filter_param(UnsharpFilterParam *fp) +static void free_filter_param(UnsharpFilterParam *fp, int nb_threads) { int z; - for (z = 0; z < 2 * fp->steps_y; z++) + for (z = 0; z < 2 * fp->steps_y * nb_threads; z++) av_freep(&fp->sc[z]); + av_freep(&fp->sc); + av_freep(&fp->sr); } static av_cold void uninit(AVFilterContext *ctx) { UnsharpContext *s = ctx->priv; - free_filter_param(&s->luma); - free_filter_param(&s->chroma); + free_filter_param(&s->luma, s->nb_threads); + 
free_filter_param(&s->chroma, s->nb_threads); } static int filter_frame(AVFilterLink *link, AVFrame *in) @@ -294,5 +348,5 @@ AVFilter ff_vf_unsharp = { .query_formats = query_formats, .inputs = avfilter_vf_unsharp_inputs, .outputs = avfilter_vf_unsharp_outputs, - .flags = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC, + .flags = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC | AVFILTER_FLAG_SLICE_THREADS, };
benchmarking with a simple command: ffmpeg -i 1080p.mp4 -vf unsharp=la=3:ca=3 -an -f null /dev/null with the patch, the fps increases from 50 to 120 on my local machine (i7-6770HQ). v2: make av_image_copy_plane() only copy per-slice content. Signed-off-by: Ruiling Song <ruiling.song@intel.com> --- libavfilter/unsharp.h | 4 +- libavfilter/vf_unsharp.c | 102 ++++++++++++++++++++++++++++++--------- 2 files changed, 81 insertions(+), 25 deletions(-)