diff mbox series

[FFmpeg-devel,v2,3/3] avcodec/vvc: simplify priority logic to improve performance for 4K/8K

Message ID 20241001065558.56890-3-nuomi2021@gmail.com
State New
Headers show
Series [FFmpeg-devel,v2,1/3] avcodec: make a local copy of executor | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Nuo Mi Oct. 1, 2024, 6:55 a.m. UTC
For 4K/8K video processing, it's possible to have over 1,000 tasks pending on the executor.
In such cases, O(n) and O(log(n)) insertion times are too costly.
Reducing the insertion time to O(1) will significantly decrease the time spent in critical sections.

clip                                                        | before | after  | delta
------------------------------------------------------------|--------|--------|-------
VVC_HDR_UHDTV2_OpenGOP_7680x4320_50fps_HLG10.bit            |    24  |   27   |  12.5%
VVC_HDR_UHDTV2_OpenGOP_7680x4320_50fps_HLG10_HighBitrate.bit|    12  |   17   |  41.7%
tears_of_steel_4k_8M_8bit_2000.vvc                          |    34  |  102   | 200.0%
VVC_UHDTV1_OpenGOP_3840x2160_60fps_HLG10.bit                |   126  |  128   |   1.6%
RitualDance_1920x1080_60_10_420_37_RA.266                   |   350  |  378   |   8.0%
NovosobornayaSquare_1920x1080.bin                           |   341  |  369   |   8.2%
Tango2_3840x2160_60_10_420_27_LD.266                        |    69  |   70   |   1.4%
RitualDance_1920x1080_60_10_420_32_LD.266                   |   243  |  259   |   6.6%
Chimera_8bit_1080P_1000_frames.vvc                          |   420  |  392   |  -6.7%
BQTerrace_1920x1080_60_10_420_22_RA.vvc                     |   148  |  144   |  -2.7%
---
 libavcodec/executor.c   | 54 ++++++++++++++++++++++++++---------------
 libavcodec/executor.h   |  5 ++--
 libavcodec/vvc/thread.c | 48 +++++++++++++++---------------------
 3 files changed, 57 insertions(+), 50 deletions(-)
diff mbox series

Patch

diff --git a/libavcodec/executor.c b/libavcodec/executor.c
index 84d52e7e3b..362e961c4f 100644
--- a/libavcodec/executor.c
+++ b/libavcodec/executor.c
@@ -48,6 +48,11 @@  typedef struct ThreadInfo {
     ExecutorThread thread;
 } ThreadInfo;
 
+typedef struct Queue {
+    AVTask *head;
+    AVTask *tail;
+} Queue;
+
 struct AVExecutor {
     AVTaskCallbacks cb;
     int thread_count;
@@ -60,29 +65,41 @@  struct AVExecutor {
     AVCond cond;
     int die;
 
-    AVTask *tasks;
+    Queue *q;
 };
 
-static AVTask* remove_task(AVTask **prev, AVTask *t)
+static AVTask* remove_task(Queue *q)
 {
-    *prev  = t->next;
-    t->next = NULL;
+    AVTask *t = q->head;
+    if (t) {
+        q->head = t->next;
+        t->next = NULL;
+        if (!q->head)
+            q->tail = NULL;
+    }
     return t;
 }
 
-static void add_task(AVTask **prev, AVTask *t)
+static void add_task(Queue *q, AVTask *t)
 {
-    t->next = *prev;
-    *prev   = t;
+    t->next = NULL;
+    if (!q->head) {
+        q->tail = q->head = t;
+    } else {
+        q->tail->next = t;
+        q->tail = t;
+    }
 }
 
 static int run_one_task(AVExecutor *e, void *lc)
 {
     AVTaskCallbacks *cb = &e->cb;
-    AVTask **prev = &e->tasks;
+    AVTask *t = NULL;
 
-    if (*prev) {
-        AVTask *t = remove_task(prev, *prev);
+    for (int i = 0; i < e->cb.priorities && !t; i++)
+        t = remove_task(e->q + i);
+
+    if (t) {
         if (e->thread_count > 0)
             ff_mutex_unlock(&e->lock);
         cb->run(t, lc, cb->user_data);
@@ -132,6 +149,7 @@  static void executor_free(AVExecutor *e, const int has_lock, const int has_cond)
         ff_mutex_destroy(&e->lock);
 
     av_free(e->threads);
+    av_free(e->q);
     av_free(e->local_contexts);
 
     av_free(e);
@@ -141,7 +159,7 @@  AVExecutor* av_executor_alloc(const AVTaskCallbacks *cb, int thread_count)
 {
     AVExecutor *e;
     int has_lock = 0, has_cond = 0;
-    if (!cb || !cb->user_data || !cb->run || !cb->priority_higher)
+    if (!cb || !cb->user_data || !cb->run || !cb->priorities)
         return NULL;
 
     e = av_mallocz(sizeof(*e));
@@ -153,6 +171,10 @@  AVExecutor* av_executor_alloc(const AVTaskCallbacks *cb, int thread_count)
     if (!e->local_contexts)
         goto free_executor;
 
+    e->q = av_calloc(e->cb.priorities, sizeof(Queue));
+    if (!e->q)
+        goto free_executor;
+
     e->threads = av_calloc(FFMAX(thread_count, 1), sizeof(*e->threads));
     if (!e->threads)
         goto free_executor;
@@ -192,16 +214,10 @@  void av_executor_free(AVExecutor **executor)
 
 void av_executor_execute(AVExecutor *e, AVTask *t)
 {
-    AVTaskCallbacks *cb = &e->cb;
-    AVTask **prev;
-
     if (e->thread_count)
         ff_mutex_lock(&e->lock);
-    if (t) {
-        for (prev = &e->tasks; *prev && cb->priority_higher(*prev, t); prev = &(*prev)->next)
-            /* nothing */;
-        add_task(prev, t);
-    }
+    if (t)
+        add_task(e->q + t->priority % e->cb.priorities, t);
     if (e->thread_count) {
         ff_cond_signal(&e->cond);
         ff_mutex_unlock(&e->lock);
diff --git a/libavcodec/executor.h b/libavcodec/executor.h
index 29fb55f66b..2398acd56c 100644
--- a/libavcodec/executor.h
+++ b/libavcodec/executor.h
@@ -31,6 +31,7 @@  typedef struct AVExecutor AVExecutor;
 typedef struct AVTask AVTask;
 
 struct AVTask {
+    int priority;   // task priority should >= 0 and < AVTaskCallbacks.priorities
     AVTask *next;
 };
 
@@ -39,8 +40,8 @@  typedef struct AVTaskCallbacks {
 
     int local_context_size;
 
-    // return 1 if a's priority > b's priority
-    int (*priority_higher)(const AVTask *a, const AVTask *b);
+    // How many priorities do we have?
+    int priorities;
 
     // run the task
     int (*run)(AVTask *t, void *local_context, void *user_data);
diff --git a/libavcodec/vvc/thread.c b/libavcodec/vvc/thread.c
index 993b682e1b..1736092abe 100644
--- a/libavcodec/vvc/thread.c
+++ b/libavcodec/vvc/thread.c
@@ -103,13 +103,28 @@  typedef struct VVCFrameThread {
     AVCond  cond;
 } VVCFrameThread;
 
+#define PRIORITY_LOWEST 2
 static void add_task(VVCContext *s, VVCTask *t)
 {
-    VVCFrameThread *ft = t->fc->ft;
+    VVCFrameThread *ft     = t->fc->ft;
+    AVTask *task           = &t->u.task;
+    const int priorities[] = {
+        0,                  // VVC_TASK_STAGE_INIT,
+        0,                  // VVC_TASK_STAGE_PARSE,
+        // For an 8K clip, a CTU line completed in the reference frame may trigger 64 and more inter tasks.
+        // We assign these tasks the lowest priority to avoid being overwhelmed with inter tasks.
+        PRIORITY_LOWEST,    // VVC_TASK_STAGE_INTER
+        1,                  // VVC_TASK_STAGE_RECON,
+        1,                  // VVC_TASK_STAGE_LMCS,
+        1,                  // VVC_TASK_STAGE_DEBLOCK_V,
+        1,                  // VVC_TASK_STAGE_DEBLOCK_H,
+        1,                  // VVC_TASK_STAGE_SAO,
+        1,                  // VVC_TASK_STAGE_ALF,
+    };
 
     atomic_fetch_add(&ft->nb_scheduled_tasks, 1);
-
-    av_executor_execute(s->executor, &t->u.task);
+    task->priority = priorities[t->stage];
+    av_executor_execute(s->executor, task);
 }
 
 static void task_init(VVCTask *t, VVCTaskStage stage, VVCFrameContext *fc, const int rx, const int ry)
@@ -372,31 +387,6 @@  static int task_is_stage_ready(VVCTask *t, int add)
     return task_has_target_score(t, stage, score);
 }
 
-#define CHECK(a, b)                         \
-    do {                                    \
-        if ((a) != (b))                     \
-            return (a) < (b);               \
-    } while (0)
-
-static int task_priority_higher(const AVTask *_a, const AVTask *_b)
-{
-    const VVCTask *a = (const VVCTask*)_a;
-    const VVCTask *b = (const VVCTask*)_b;
-
-
-    if (a->stage <= VVC_TASK_STAGE_PARSE || b->stage <= VVC_TASK_STAGE_PARSE) {
-        CHECK(a->stage, b->stage);
-        CHECK(a->fc->decode_order, b->fc->decode_order);           //decode order
-        CHECK(a->ry, b->ry);
-        return a->rx < b->rx;
-    }
-
-    CHECK(a->fc->decode_order, b->fc->decode_order);              //decode order
-    CHECK(a->rx + a->ry + a->stage, b->rx + b->ry + b->stage);    //zigzag with type
-    CHECK(a->rx + a->ry, b->rx + b->ry);                          //zigzag
-    return a->ry < b->ry;
-}
-
 static void check_colocation(VVCContext *s, VVCTask *t)
 {
     const VVCFrameContext *fc = t->fc;
@@ -681,7 +671,7 @@  AVExecutor* ff_vvc_executor_alloc(VVCContext *s, const int thread_count)
     AVTaskCallbacks callbacks = {
         s,
         sizeof(VVCLocalContext),
-        task_priority_higher,
+        PRIORITY_LOWEST + 1,
         task_run,
     };
     return av_executor_alloc(&callbacks, thread_count);