
[FFmpeg-devel] avfilter/pthread: rewrite implementation

Message ID 20170707022546.6164-1-mfcc64@gmail.com
State Superseded

Commit Message

Muhammad Faiz July 7, 2017, 2:25 a.m. UTC
Avoid pthread_cond_broadcast(), which wakes up all workers. Make each of them
use a distinct mutex/cond pair.
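
For illustration only (not part of the patch): a minimal, self-contained sketch of
the wake-up pattern described above, in which every worker sleeps on its own
mutex/cond pair and the main thread signals each worker individually instead of
broadcasting on one shared condition variable. All names (Worker, worker_fn,
have_work, quit, NB_WORKERS) are invented for this sketch, error checking is
omitted, and it builds with -pthread.

#include <pthread.h>

#define NB_WORKERS 4

typedef struct Worker {
    pthread_t       thread;
    pthread_mutex_t mutex;     /* protects have_work and quit */
    pthread_cond_t  cond;      /* private to this worker */
    int             have_work;
    int             quit;
} Worker;

static void *worker_fn(void *arg)
{
    Worker *w = arg;

    pthread_mutex_lock(&w->mutex);
    for (;;) {
        /* sleep on this worker's own cond; a signal here wakes only this
         * worker, never the others */
        while (!w->have_work && !w->quit)
            pthread_cond_wait(&w->cond, &w->mutex);
        if (w->quit)
            break;
        w->have_work = 0;
        pthread_mutex_unlock(&w->mutex);

        /* ... claim and run jobs here ... */

        pthread_mutex_lock(&w->mutex);
    }
    pthread_mutex_unlock(&w->mutex);
    return NULL;
}

int main(void)
{
    Worker workers[NB_WORKERS];

    for (int i = 0; i < NB_WORKERS; i++) {
        Worker *w = &workers[i];
        pthread_mutex_init(&w->mutex, NULL);
        pthread_cond_init(&w->cond, NULL);
        w->have_work = 0;
        w->quit      = 0;
        pthread_create(&w->thread, NULL, worker_fn, w);
    }

    /* hand out work: one targeted signal per worker instead of a single
     * broadcast on a condition variable shared by all of them */
    for (int i = 0; i < NB_WORKERS; i++) {
        Worker *w = &workers[i];
        pthread_mutex_lock(&w->mutex);
        w->have_work = 1;
        pthread_cond_signal(&w->cond);
        pthread_mutex_unlock(&w->mutex);
    }

    /* shut down */
    for (int i = 0; i < NB_WORKERS; i++) {
        Worker *w = &workers[i];
        pthread_mutex_lock(&w->mutex);
        w->quit = 1;
        pthread_cond_signal(&w->cond);
        pthread_mutex_unlock(&w->mutex);
        pthread_join(w->thread, NULL);
        pthread_cond_destroy(&w->cond);
        pthread_mutex_destroy(&w->mutex);
    }
    return 0;
}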

Benchmark using afir with threads=4 and a 4096-tap FIR:
channels=1:
old:
    2128210 decicycles in afir_execute,       2 runs,      0 skips
    1382927 decicycles in afir_execute,    1024 runs,      0 skips
    1367985 decicycles in afir_execute,   16374 runs,     10 skips
new:
    1011270 decicycles in afir_execute,       2 runs,      0 skips
     939891 decicycles in afir_execute,    1024 runs,      0 skips
     955812 decicycles in afir_execute,   16383 runs,      1 skips

channels=2:
old:
    2801720 decicycles in afir_execute,       2 runs,      0 skips
    1624556 decicycles in afir_execute,    1024 runs,      0 skips
    1722584 decicycles in afir_execute,   16380 runs,      4 skips
new:
    1864780 decicycles in afir_execute,       2 runs,      0 skips
    1307955 decicycles in afir_execute,    1024 runs,      0 skips
    1110917 decicycles in afir_execute,   16384 runs,      0 skips

channels=3:
old:
    3031255 decicycles in afir_execute,       2 runs,      0 skips
    2545295 decicycles in afir_execute,    1024 runs,      0 skips
    2498368 decicycles in afir_execute,   16384 runs,      0 skips
new:
    2213540 decicycles in afir_execute,       2 runs,      0 skips
    2305479 decicycles in afir_execute,    1024 runs,      0 skips
    2001942 decicycles in afir_execute,   16382 runs,      2 skips

channels=4:
old:
    4642510 decicycles in afir_execute,       2 runs,      0 skips
    3356856 decicycles in afir_execute,    1024 runs,      0 skips
    2994766 decicycles in afir_execute,   16382 runs,      2 skips
new:
    3590650 decicycles in afir_execute,       2 runs,      0 skips
    2456035 decicycles in afir_execute,    1024 runs,      0 skips
    2332966 decicycles in afir_execute,   16384 runs,      0 skips

channels=6:
old:
    5057785 decicycles in afir_execute,       2 runs,      0 skips
    4279000 decicycles in afir_execute,    1023 runs,      1 skips
    4102256 decicycles in afir_execute,   16383 runs,      1 skips
new:
    4244160 decicycles in afir_execute,       2 runs,      0 skips
    3851306 decicycles in afir_execute,    1024 runs,      0 skips
    3343221 decicycles in afir_execute,   16384 runs,      0 skips

channels=8:
old:
    4871740 decicycles in afir_execute,       2 runs,      0 skips
    4807337 decicycles in afir_execute,    1023 runs,      1 skips
    4454018 decicycles in afir_execute,   16374 runs,     10 skips
new:
    5055460 decicycles in afir_execute,       2 runs,      0 skips
    4554674 decicycles in afir_execute,    1023 runs,      1 skips
    4433398 decicycles in afir_execute,   16382 runs,      2 skips
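
Side note for readers: numbers like these come from FFmpeg's timing macros in
libavutil/timer.h. START_TIMER/STOP_TIMER accumulate cycle counts, drop runs
that look like outliers as "skips", and print a "decicycles in <id>, N runs,
M skips" line whenever the run count reaches a power of two. A rough sketch of
that instrumentation, assuming a hypothetical do_filter() under test and a
build inside the FFmpeg source tree:

#include "libavutil/timer.h"

static void do_filter(void)
{
    /* hypothetical work being measured */
}

static void filter_with_timer(void)
{
    START_TIMER
    do_filter();
    STOP_TIMER("afir_execute")
}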

Signed-off-by: Muhammad Faiz <mfcc64@gmail.com>
---
 libavfilter/pthread.c | 189 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 116 insertions(+), 73 deletions(-)

Comments

Paul B Mahol July 7, 2017, 7:45 a.m. UTC | #1
On 7/7/17, Muhammad Faiz <mfcc64@gmail.com> wrote:
> Avoid pthread_cond_broadcast(), which wakes up all workers. Make each of them
> use a distinct mutex/cond pair.
>

What about video filters?
Muhammad Faiz July 7, 2017, 9:21 a.m. UTC | #2
On Fri, Jul 7, 2017 at 2:45 PM, Paul B Mahol <onemda@gmail.com> wrote:
> On 7/7/17, Muhammad Faiz <mfcc64@gmail.com> wrote:
>> Avoid pthread_cond_broadcast(), which wakes up all workers. Make each of them
>> use a distinct mutex/cond pair.
>>
>
> What about video filters?

./ffmpeg -filter_complex_threads 16 -filter_complex
"nullsrc=s=1920x1080, transpose=dir=clock" -f rawvideo -y /dev/null
old:
74242240 decicycles in vf_transpose_execute,       1 runs,      0 skips
69378780 decicycles in vf_transpose_execute,       2 runs,      0 skips
66627910 decicycles in vf_transpose_execute,       4 runs,      0 skips
69064686 decicycles in vf_transpose_execute,       8 runs,      0 skips
67557351 decicycles in vf_transpose_execute,      16 runs,      0 skips
65460021 decicycles in vf_transpose_execute,      32 runs,      0 skips
62830474 decicycles in vf_transpose_execute,      64 runs,      0 skips
65043030 decicycles in vf_transpose_execute,     128 runs,      0 skips
62733184 decicycles in vf_transpose_execute,     256 runs,      0 skips
61353307 decicycles in vf_transpose_execute,     512 runs,      0 skips
60308065 decicycles in vf_transpose_execute,    1024 runs,      0 skips
60829621 decicycles in vf_transpose_execute,    2048 runs,      0 skips
61178596 decicycles in vf_transpose_execute,    4096 runs,      0 skips
60492125 decicycles in vf_transpose_execute,    8192 runs,      0 skips
60141149 decicycles in vf_transpose_execute,   16384 runs,      0 skips
60567245 decicycles in vf_transpose_execute,   32768 runs,      0 skips

new:
53044380 decicycles in vf_transpose_execute,       1 runs,      0 skips
57543460 decicycles in vf_transpose_execute,       2 runs,      0 skips
59047585 decicycles in vf_transpose_execute,       4 runs,      0 skips
59139675 decicycles in vf_transpose_execute,       8 runs,      0 skips
62851108 decicycles in vf_transpose_execute,      16 runs,      0 skips
60888400 decicycles in vf_transpose_execute,      32 runs,      0 skips
59835808 decicycles in vf_transpose_execute,      64 runs,      0 skips
61603125 decicycles in vf_transpose_execute,     128 runs,      0 skips
60237035 decicycles in vf_transpose_execute,     256 runs,      0 skips
60044255 decicycles in vf_transpose_execute,     512 runs,      0 skips
59737751 decicycles in vf_transpose_execute,    1024 runs,      0 skips
58839457 decicycles in vf_transpose_execute,    2048 runs,      0 skips
56007488 decicycles in vf_transpose_execute,    4096 runs,      0 skips
52096972 decicycles in vf_transpose_execute,    8192 runs,      0 skips
50752821 decicycles in vf_transpose_execute,   16384 runs,      0 skips
50119401 decicycles in vf_transpose_execute,   32768 runs,      0 skips
Clément Bœsch July 7, 2017, 9:26 a.m. UTC | #3
On Fri, Jul 07, 2017 at 09:25:46AM +0700, Muhammad Faiz wrote:
> Avoid pthread_cond_broadcast(), which wakes up all workers. Make each of them
> use a distinct mutex/cond pair.
> 

Please make sure this doesn't make tsan go crazy.

Also, the code is pretty much based on lavc/pthread_slice.c, so you may
want to keep differences low by applying your changes in lavc as well. I'm
sure people will be happy if libavcodec gets faster.

Interesting benchmarks btw :)
Muhammad Faiz July 7, 2017, 2:07 p.m. UTC | #4
On Fri, Jul 7, 2017 at 4:26 PM, Clément Bœsch <u@pkh.me> wrote:
> On Fri, Jul 07, 2017 at 09:25:46AM +0700, Muhammad Faiz wrote:
>> Avoid pthread_cond_broadcast(), which wakes up all workers. Make each of them
>> use a distinct mutex/cond pair.
>>
>
> Please make sure this doesn't make tsan go crazy.

Fixed. New patch posted.

>
> Also, the code is pretty much based on lavc/pthread_slice.c, so you may
> want to keep differences low by applying your changes in lavc as well. I'm
> sure people will be happy if libavcodec gets faster.

I'm currently looking at it.

>
> Interesting benchmarks btw :)
>
> --
> Clément B.

Patch

diff --git a/libavfilter/pthread.c b/libavfilter/pthread.c
index c7a0021..8fb3409 100644
--- a/libavfilter/pthread.c
+++ b/libavfilter/pthread.c
@@ -21,6 +21,7 @@ 
  * Libavfilter multithreading support
  */
 
+#include <stdatomic.h>
 #include "config.h"
 
 #include "libavutil/common.h"
@@ -32,61 +33,75 @@ 
 #include "internal.h"
 #include "thread.h"
 
+typedef struct WorkerContext WorkerContext;
+
 typedef struct ThreadContext {
     AVFilterGraph *graph;
 
-    int nb_threads;
-    pthread_t *workers;
+    int nb_workers;
+    WorkerContext *workers;
     avfilter_action_func *func;
 
     /* per-execute parameters */
     AVFilterContext *ctx;
     void *arg;
     int   *rets;
-    int nb_jobs;
+    unsigned nb_jobs;
 
-    pthread_cond_t last_job_cond;
-    pthread_cond_t current_job_cond;
-    pthread_mutex_t current_job_lock;
-    int current_job;
-    unsigned int current_execute;
+    pthread_mutex_t mutex_done;
+    pthread_cond_t cond_done;
+    atomic_uint current_job;
+    atomic_uint nb_finished_jobs;
     int done;
 } ThreadContext;
 
-static void* attribute_align_arg worker(void *v)
+struct WorkerContext {
+    ThreadContext   *ctx;
+    pthread_t       thread;
+    pthread_mutex_t mutex;
+    pthread_cond_t  cond;
+    int             done;
+};
+
+static unsigned run_jobs(ThreadContext *c)
 {
-    ThreadContext *c = v;
-    int our_job      = c->nb_jobs;
-    int nb_threads   = c->nb_threads;
-    unsigned int last_execute = 0;
-    int ret, self_id;
-
-    pthread_mutex_lock(&c->current_job_lock);
-    self_id = c->current_job++;
-
-    for (;;) {
-        while (our_job >= c->nb_jobs) {
-            if (c->current_job == nb_threads + c->nb_jobs)
-                pthread_cond_signal(&c->last_job_cond);
-
-            while (last_execute == c->current_execute && !c->done)
-                pthread_cond_wait(&c->current_job_cond, &c->current_job_lock);
-            last_execute = c->current_execute;
-            our_job = self_id;
-
-            if (c->done) {
-                pthread_mutex_unlock(&c->current_job_lock);
-                return NULL;
-            }
-        }
-        pthread_mutex_unlock(&c->current_job_lock);
+    unsigned current_job, nb_finished_jobs = 0;
 
-        ret = c->func(c->ctx, c->arg, our_job, c->nb_jobs);
+    while (nb_finished_jobs != c->nb_jobs &&
+           (current_job = atomic_fetch_add_explicit(&c->current_job, 1, memory_order_acq_rel)) < c->nb_jobs) {
+        int ret = c->func(c->ctx, c->arg, current_job, c->nb_jobs);
         if (c->rets)
-            c->rets[our_job % c->nb_jobs] = ret;
+            c->rets[current_job] = ret;
+        nb_finished_jobs = atomic_fetch_add_explicit(&c->nb_finished_jobs, 1, memory_order_acq_rel) + 1;
+    }
+
+    return nb_finished_jobs;
+}
+
+static void* attribute_align_arg worker(void *v)
+{
+    WorkerContext *w = v;
+    ThreadContext *c = w->ctx;
+
+    pthread_mutex_lock(&w->mutex);
+    pthread_cond_signal(&w->cond);
+
+    while (1) {
+        w->done = 1;
+        while (w->done)
+            pthread_cond_wait(&w->cond, &w->mutex);
+
+        if (c->done) {
+            pthread_mutex_unlock(&w->mutex);
+            return NULL;
+        }
 
-        pthread_mutex_lock(&c->current_job_lock);
-        our_job = c->current_job++;
+        if (run_jobs(c) == c->nb_jobs) {
+            pthread_mutex_lock(&c->mutex_done);
+            c->done = 1;
+            pthread_cond_signal(&c->cond_done);
+            pthread_mutex_unlock(&c->mutex_done);
+        }
     }
 }
 
@@ -94,48 +109,66 @@  static void slice_thread_uninit(ThreadContext *c)
 {
     int i;
 
-    pthread_mutex_lock(&c->current_job_lock);
     c->done = 1;
-    pthread_cond_broadcast(&c->current_job_cond);
-    pthread_mutex_unlock(&c->current_job_lock);
+    for (i = 0; i < c->nb_workers; i++) {
+        WorkerContext *w = &c->workers[i];
 
-    for (i = 0; i < c->nb_threads; i++)
-         pthread_join(c->workers[i], NULL);
+        pthread_mutex_lock(&w->mutex);
+        w->done = 0;
+        pthread_cond_signal(&w->cond);
+        pthread_mutex_unlock(&w->mutex);
 
-    pthread_mutex_destroy(&c->current_job_lock);
-    pthread_cond_destroy(&c->current_job_cond);
-    pthread_cond_destroy(&c->last_job_cond);
-    av_freep(&c->workers);
-}
+        pthread_join(w->thread, NULL);
+        pthread_cond_destroy(&w->cond);
+        pthread_mutex_destroy(&w->mutex);
+    }
 
-static void slice_thread_park_workers(ThreadContext *c)
-{
-    while (c->current_job != c->nb_threads + c->nb_jobs)
-        pthread_cond_wait(&c->last_job_cond, &c->current_job_lock);
-    pthread_mutex_unlock(&c->current_job_lock);
+    pthread_cond_destroy(&c->cond_done);
+    pthread_mutex_destroy(&c->mutex_done);
+    av_freep(&c->workers);
 }
 
 static int thread_execute(AVFilterContext *ctx, avfilter_action_func *func,
-                          void *arg, int *ret, int nb_jobs)
+                          void *arg, int *rets, int nb_jobs)
 {
     ThreadContext *c = ctx->graph->internal->thread;
+    int nb_workers, i;
 
     if (nb_jobs <= 0)
         return 0;
 
-    pthread_mutex_lock(&c->current_job_lock);
+    if (nb_jobs == 1) {
+        int ret = func(ctx, arg, 0, 1);
+        if (rets)
+            rets[0] = ret;
+        return 0;
+    }
 
-    c->current_job = c->nb_threads;
+    nb_workers = FFMIN(c->nb_workers, nb_jobs - 1);
+    atomic_store_explicit(&c->current_job, 0, memory_order_relaxed);
+    atomic_store_explicit(&c->nb_finished_jobs, 0, memory_order_relaxed);
     c->nb_jobs     = nb_jobs;
     c->ctx         = ctx;
     c->arg         = arg;
     c->func        = func;
-    c->rets        = ret;
-    c->current_execute++;
+    c->rets        = rets;
+
+    for (i = 0; i < nb_workers; i++) {
+        WorkerContext *w = &c->workers[i];
 
-    pthread_cond_broadcast(&c->current_job_cond);
+        pthread_mutex_lock(&w->mutex);
+        w->done = 0;
+        pthread_cond_signal(&w->cond);
+        pthread_mutex_unlock(&w->mutex);
+    }
 
-    slice_thread_park_workers(c);
+    if (run_jobs(c) != c->nb_jobs) {
+        pthread_mutex_lock(&c->mutex_done);
+        while (!c->done)
+            pthread_cond_wait(&c->cond_done, &c->mutex_done);
+        c->done = 0;
+        pthread_mutex_unlock(&c->mutex_done);
+    }
 
     return 0;
 }
@@ -156,33 +189,43 @@  static int thread_init_internal(ThreadContext *c, int nb_threads)
     if (nb_threads <= 1)
         return 1;
 
-    c->nb_threads = nb_threads;
-    c->workers = av_mallocz_array(sizeof(*c->workers), nb_threads);
+    c->nb_workers = nb_threads - 1;
+    c->workers = av_mallocz_array(sizeof(*c->workers), c->nb_workers);
     if (!c->workers)
         return AVERROR(ENOMEM);
 
-    c->current_job = 0;
+    pthread_mutex_init(&c->mutex_done, NULL);
+    pthread_cond_init(&c->cond_done, NULL);
+    atomic_init(&c->current_job, 0);
+    atomic_init(&c->nb_finished_jobs, 0);
     c->nb_jobs     = 0;
     c->done        = 0;
 
-    pthread_cond_init(&c->current_job_cond, NULL);
-    pthread_cond_init(&c->last_job_cond,    NULL);
+    for (i = 0; i < c->nb_workers; i++) {
+        WorkerContext *w = &c->workers[i];
 
-    pthread_mutex_init(&c->current_job_lock, NULL);
-    pthread_mutex_lock(&c->current_job_lock);
-    for (i = 0; i < nb_threads; i++) {
-        ret = pthread_create(&c->workers[i], NULL, worker, c);
+        w->ctx = c;
+        pthread_mutex_init(&w->mutex, NULL);
+        pthread_cond_init(&w->cond, NULL);
+        pthread_mutex_lock(&w->mutex);
+        w->done = 0;
+        ret = pthread_create(&w->thread, NULL, worker, w);
         if (ret) {
-           pthread_mutex_unlock(&c->current_job_lock);
-           c->nb_threads = i;
+           c->nb_workers = i;
+           pthread_mutex_unlock(&w->mutex);
+           pthread_cond_destroy(&w->cond);
+           pthread_mutex_destroy(&w->mutex);
            slice_thread_uninit(c);
            return AVERROR(ret);
         }
+
+        while (!w->done)
+            pthread_cond_wait(&w->cond, &w->mutex);
+        pthread_mutex_unlock(&w->mutex);
     }
 
-    slice_thread_park_workers(c);
 
-    return c->nb_threads;
+    return c->nb_workers + 1;
 }
 
 int ff_graph_thread_init(AVFilterGraph *graph)
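
As a reading aid (not part of the submission): the new run_jobs() above replaces
the mutex-protected job counter with atomics, so every participating thread, the
caller included, claims the next job index with an atomic fetch-and-add, and the
thread whose increment of nb_finished_jobs reaches nb_jobs knows it completed the
last job and can signal the waiter. A standalone, single-threaded sketch of that
claiming loop, with simplified names and a hypothetical JobPool/job callback:

#include <stdatomic.h>
#include <stdio.h>

typedef struct JobPool {
    atomic_uint current_job;       /* next job index to hand out */
    atomic_uint nb_finished_jobs;  /* jobs completed so far */
    unsigned    nb_jobs;           /* total number of jobs */
} JobPool;

/* Claim and run jobs until none are left. Returns the number of finished jobs
 * this thread observed; it equals nb_jobs on exactly one thread, the one that
 * completed the last job. */
static unsigned claim_and_run(JobPool *p,
                              void (*job)(unsigned idx, void *arg), void *arg)
{
    unsigned current, finished = 0;

    while (finished != p->nb_jobs &&
           (current = atomic_fetch_add_explicit(&p->current_job, 1,
                                                memory_order_acq_rel)) < p->nb_jobs) {
        job(current, arg);
        finished = atomic_fetch_add_explicit(&p->nb_finished_jobs, 1,
                                             memory_order_acq_rel) + 1;
    }
    return finished;
}

static void print_job(unsigned idx, void *arg)
{
    (void)arg;
    printf("running job %u\n", idx);
}

int main(void)
{
    JobPool pool = { .nb_jobs = 8 };

    atomic_init(&pool.current_job, 0);
    atomic_init(&pool.nb_finished_jobs, 0);

    /* single-threaded here; in the patch the same loop runs concurrently in
     * the calling thread and in every signalled worker */
    unsigned finished = claim_and_run(&pool, print_job, NULL);
    printf("finished %u of %u jobs\n", finished, pool.nb_jobs);
    return 0;
}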