[FFmpeg-devel] Parallelize vf_lut

Submitted by Britt Cyr on Feb. 25, 2019, 8:25 p.m.

Details

Message ID 20190225202530.24526-1-cyr@google.com
State New
Headers show

Commit Message

Britt Cyr Feb. 25, 2019, 8:25 p.m.
---
 libavfilter/vf_lut.c | 106 ++++++++++++++++++++++++++++---------------
 1 file changed, 70 insertions(+), 36 deletions(-)

Comments

Michael Niedermayer Feb. 27, 2019, 3:28 p.m.
On Mon, Feb 25, 2019 at 03:25:30PM -0500, Britt Cyr wrote:
> ---
>  libavfilter/vf_lut.c | 106 ++++++++++++++++++++++++++++---------------
>  1 file changed, 70 insertions(+), 36 deletions(-)
> 
> diff --git a/libavfilter/vf_lut.c b/libavfilter/vf_lut.c
> index c815ddc194..14386938be 100644
> --- a/libavfilter/vf_lut.c
> +++ b/libavfilter/vf_lut.c
> @@ -72,6 +72,12 @@ typedef struct LutContext {
>      int negate_alpha; /* only used by negate */
>  } LutContext;
>  
> +typedef struct ThreadData {
> +  AVFrame *in;
> +  AVFrame *out;
> +  AVFilterLink *link;
> +} ThreadData;

indentation depth is inconsistent


[...]
> @@ -366,11 +359,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
>          const int in_linesize  =  in->linesize[0] / 2;
>          const int out_linesize = out->linesize[0] / 2;
>          const int step = s->step;
> +        const int row_min = jobnr / nb_jobs * h;
> +        const int row_max = (jobnr + 1) / nb_jobs * h;
>  
>          inrow0  = (uint16_t*) in ->data[0];
>          outrow0 = (uint16_t*) out->data[0];
>  
> -        for (i = 0; i < h; i ++) {
> +        for (i = row_min; i < row_max; i ++) {
>              inrow  = inrow0;
>              outrow = outrow0;
>              for (j = 0; j < w; j++) {
> @@ -403,11 +398,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
>          const int in_linesize  =  in->linesize[0];
>          const int out_linesize = out->linesize[0];
>          const int step = s->step;
> +        const int row_min = jobnr / nb_jobs * h;
> +        const int row_max = (jobnr + 1) / nb_jobs * h;
>  
>          inrow0  = in ->data[0];
>          outrow0 = out->data[0];
>  
> -        for (i = 0; i < h; i ++) {
> +        for (i = row_min; i < row_max; i ++) {
>              inrow  = inrow0;
>              outrow = outrow0;
>              for (j = 0; j < w; j++) {
> @@ -435,11 +432,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
>              const uint16_t *tab = s->lut[plane];
>              const int in_linesize  =  in->linesize[plane] / 2;
>              const int out_linesize = out->linesize[plane] / 2;
> +            const int row_min = jobnr / nb_jobs * h;
> +            const int row_max = (jobnr + 1) / nb_jobs * h;
>  
>              inrow  = (uint16_t *)in ->data[plane];
>              outrow = (uint16_t *)out->data[plane];
>  
> -            for (i = 0; i < h; i++) {
> +            for (i = row_min; i < row_max; i++) {
>                  for (j = 0; j < w; j++) {
>  #if HAVE_BIGENDIAN
>                      outrow[j] = av_bswap16(tab[av_bswap16(inrow[j])]);
> @@ -463,11 +462,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
>              const uint16_t *tab = s->lut[plane];
>              const int in_linesize  =  in->linesize[plane];
>              const int out_linesize = out->linesize[plane];
> +            const int row_min = jobnr / nb_jobs * h;
> +            const int row_max = (jobnr + 1) / nb_jobs * h;
>  
>              inrow  = in ->data[plane];
>              outrow = out->data[plane];
>  
> -            for (i = 0; i < h; i++) {
> +            for (i = row_min; i < row_max; i++) {
>                  for (j = 0; j < w; j++)
>                      outrow[j] = tab[inrow[j]];
>                  inrow  += in_linesize;

unrelated to your patch, I just spotted this as it makes it obvious:
replicating this code 4 times is a bit ugly


> @@ -476,9 +477,42 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
>          }
>      }
>  
> -    if (!direct)
> +    return 0;
> +}
> +
> +static AVFrame *apply_lut(AVFilterLink *inlink, AVFrame *in) {
> +    AVFilterContext *ctx = inlink->dst;
> +    AVFilterLink *outlink = ctx->outputs[0];
> +    AVFrame *out;
> +    ThreadData td;
> +
> +    if (av_frame_is_writable(in)) {
> +        out = in;
> +    } else {
> +        out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
> +        if (!out) {
> +            av_frame_free(&in);
> +            return NULL;
> +        }
> +        av_frame_copy_props(out, in);
> +    }
> +    td.in  = in;
> +    td.out = out;
> +    td.link = inlink;

> +    ctx->internal->execute(ctx, lookup_slice, &td, NULL, FFMIN(outlink->h, 1));

How many tasks does this run in parallel, and how much faster is it?

thanks

[...]

Patch hide | download patch | download mbox

diff --git a/libavfilter/vf_lut.c b/libavfilter/vf_lut.c
index c815ddc194..14386938be 100644
--- a/libavfilter/vf_lut.c
+++ b/libavfilter/vf_lut.c
@@ -72,6 +72,12 @@  typedef struct LutContext {
     int negate_alpha; /* only used by negate */
 } LutContext;
 
+typedef struct ThreadData {
+  AVFrame *in;
+  AVFrame *out;
+  AVFilterLink *link;
+} ThreadData;
+
 #define Y 0
 #define U 1
 #define V 2
@@ -337,26 +343,13 @@  static int config_props(AVFilterLink *inlink)
     return 0;
 }
 
-static int filter_frame(AVFilterLink *inlink, AVFrame *in)
-{
-    AVFilterContext *ctx = inlink->dst;
+static int lookup_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) {
     LutContext *s = ctx->priv;
-    AVFilterLink *outlink = ctx->outputs[0];
-    AVFrame *out;
-    int i, j, plane, direct = 0;
-
-    if (av_frame_is_writable(in)) {
-        direct = 1;
-        out = in;
-    } else {
-        out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
-        if (!out) {
-            av_frame_free(&in);
-            return AVERROR(ENOMEM);
-        }
-        av_frame_copy_props(out, in);
-    }
-
+    int i, j, plane = 0;
+    const ThreadData *td = arg;
+    const AVFrame *in  = td->in;
+    AVFrame *out = td->out;
+    const AVFilterLink *inlink = td->link;
     if (s->is_rgb && s->is_16bit && !s->is_planar) {
         /* packed, 16-bit */
         uint16_t *inrow, *outrow, *inrow0, *outrow0;
@@ -366,11 +359,13 @@  static int filter_frame(AVFilterLink *inlink, AVFrame *in)
         const int in_linesize  =  in->linesize[0] / 2;
         const int out_linesize = out->linesize[0] / 2;
         const int step = s->step;
+        const int row_min = jobnr / nb_jobs * h;
+        const int row_max = (jobnr + 1) / nb_jobs * h;
 
         inrow0  = (uint16_t*) in ->data[0];
         outrow0 = (uint16_t*) out->data[0];
 
-        for (i = 0; i < h; i ++) {
+        for (i = row_min; i < row_max; i ++) {
             inrow  = inrow0;
             outrow = outrow0;
             for (j = 0; j < w; j++) {
@@ -403,11 +398,13 @@  static int filter_frame(AVFilterLink *inlink, AVFrame *in)
         const int in_linesize  =  in->linesize[0];
         const int out_linesize = out->linesize[0];
         const int step = s->step;
+        const int row_min = jobnr / nb_jobs * h;
+        const int row_max = (jobnr + 1) / nb_jobs * h;
 
         inrow0  = in ->data[0];
         outrow0 = out->data[0];
 
-        for (i = 0; i < h; i ++) {
+        for (i = row_min; i < row_max; i ++) {
             inrow  = inrow0;
             outrow = outrow0;
             for (j = 0; j < w; j++) {
@@ -435,11 +432,13 @@  static int filter_frame(AVFilterLink *inlink, AVFrame *in)
             const uint16_t *tab = s->lut[plane];
             const int in_linesize  =  in->linesize[plane] / 2;
             const int out_linesize = out->linesize[plane] / 2;
+            const int row_min = jobnr / nb_jobs * h;
+            const int row_max = (jobnr + 1) / nb_jobs * h;
 
             inrow  = (uint16_t *)in ->data[plane];
             outrow = (uint16_t *)out->data[plane];
 
-            for (i = 0; i < h; i++) {
+            for (i = row_min; i < row_max; i++) {
                 for (j = 0; j < w; j++) {
 #if HAVE_BIGENDIAN
                     outrow[j] = av_bswap16(tab[av_bswap16(inrow[j])]);
@@ -463,11 +462,13 @@  static int filter_frame(AVFilterLink *inlink, AVFrame *in)
             const uint16_t *tab = s->lut[plane];
             const int in_linesize  =  in->linesize[plane];
             const int out_linesize = out->linesize[plane];
+            const int row_min = jobnr / nb_jobs * h;
+            const int row_max = (jobnr + 1) / nb_jobs * h;
 
             inrow  = in ->data[plane];
             outrow = out->data[plane];
 
-            for (i = 0; i < h; i++) {
+            for (i = row_min; i < row_max; i++) {
                 for (j = 0; j < w; j++)
                     outrow[j] = tab[inrow[j]];
                 inrow  += in_linesize;
@@ -476,9 +477,42 @@  static int filter_frame(AVFilterLink *inlink, AVFrame *in)
         }
     }
 
-    if (!direct)
+    return 0;
+}
+
+static AVFrame *apply_lut(AVFilterLink *inlink, AVFrame *in) {
+    AVFilterContext *ctx = inlink->dst;
+    AVFilterLink *outlink = ctx->outputs[0];
+    AVFrame *out;
+    ThreadData td;
+
+    if (av_frame_is_writable(in)) {
+        out = in;
+    } else {
+        out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+        if (!out) {
+            av_frame_free(&in);
+            return NULL;
+        }
+        av_frame_copy_props(out, in);
+    }
+    td.in  = in;
+    td.out = out;
+    td.link = inlink;
+    ctx->internal->execute(ctx, lookup_slice, &td, NULL, FFMIN(outlink->h, 1));
+
+    if (out != in)
         av_frame_free(&in);
 
+    return out;
+}
+
+static int filter_frame(AVFilterLink *inlink, AVFrame *in)
+{
+    AVFilterLink *outlink = inlink->dst->outputs[0];
+    AVFrame *out = apply_lut(inlink, in);
+    if (!out)
+        return AVERROR(ENOMEM);
     return ff_filter_frame(outlink, out);
 }
 
@@ -497,18 +531,18 @@  static const AVFilterPad outputs[] = {
     { NULL }
 };
 
-#define DEFINE_LUT_FILTER(name_, description_)                          \
-    AVFilter ff_vf_##name_ = {                                          \
-        .name          = #name_,                                        \
-        .description   = NULL_IF_CONFIG_SMALL(description_),            \
-        .priv_size     = sizeof(LutContext),                            \
-        .priv_class    = &name_ ## _class,                              \
-        .init          = name_##_init,                                  \
-        .uninit        = uninit,                                        \
-        .query_formats = query_formats,                                 \
-        .inputs        = inputs,                                        \
-        .outputs       = outputs,                                       \
-        .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC,        \
+#define DEFINE_LUT_FILTER(name_, description_)                                                  \
+    AVFilter ff_vf_##name_ = {                                                                  \
+        .name          = #name_,                                                                \
+        .description   = NULL_IF_CONFIG_SMALL(description_),                                    \
+        .priv_size     = sizeof(LutContext),                                                    \
+        .priv_class    = &name_ ## _class,                                                      \
+        .init          = name_##_init,                                                          \
+        .uninit        = uninit,                                                                \
+        .query_formats = query_formats,                                                         \
+        .inputs        = inputs,                                                                \
+        .outputs       = outputs,                                                               \
+        .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC |  AVFILTER_FLAG_SLICE_THREADS, \
     }
 
 #if CONFIG_LUT_FILTER