diff mbox

[FFmpeg-devel,00/10] Vulkan code general/performance improvements

Message ID M7ObtWF--3-2@lynne.ee
State Accepted
Commit 727cac88b8c4b1facd93a3c863ef7e7072feda36
Headers show

Commit Message

Lynne May 15, 2020, 6:36 p.m. UTC
Just posting this as a single email so as not to spam the ML too much.
Going to push this sometime over the weekend alongside the other 3 patches to finish
all work on Vulkan before the next release.
Satisfied with the code, though the last patch is slightly ugly.
Performance improvements out of async code: 78% on a toy GPU with 1 queue for an upload+scale.
Probably closer to 200% on a discrete GPU with 4 transfer/compute queues.
Subject: [PATCH 10/10] lavfi/vulkan: use all enabled queues in the queue
 family

This should significantly improve the performance with certain
filterchains.
---
 libavfilter/vf_avgblur_vulkan.c   |  39 ++--
 libavfilter/vf_chromaber_vulkan.c |  30 +--
 libavfilter/vf_overlay_vulkan.c   |  37 ++--
 libavfilter/vf_scale_vulkan.c     |  30 +--
 libavfilter/vulkan.c              | 296 +++++++++++++++++++++++-------
 libavfilter/vulkan.h              |  74 ++++++--
 6 files changed, 371 insertions(+), 135 deletions(-)

Comments

James Almer May 15, 2020, 6:47 p.m. UTC | #1
On 5/15/2020 3:36 PM, Lynne wrote:
> Just posting this as a single email not to spam the ML too much.
> Going to push this sometime over the weekend alongside the other 3 patches to finish
> all work on Vulkan before the next release.

The new release is not going to happen tomorrow or even next week, so
you could give people more than two days to look at a dozen patches...

> Satisfied with the code, though the last patch is slightly ugly.
> Performance improvements out of async code: 78% on a toy GPU with 1 queue for an upload+scale
> Probably closer to 200% on a discrete GPU with 4 transfer/compute queues.
Anton Khirnov May 18, 2020, 7:35 a.m. UTC | #2
Quoting James Almer (2020-05-15 20:47:56)
> On 5/15/2020 3:36 PM, Lynne wrote:
> > Just posting this as a single email not to spam the ML too much.
> > Going to push this sometime over the weekend alongside the other 3 patches to finish
> > all work on Vulkan before the next release.
> 
> The new release is not going to happen tomorrow or even next week, so
> you could give people more than two days to look at a dozen patches...

I'd add that individual emails are a lot easier to read/review. And if
you send them as a thread, which git-send-email does by default, it
doesn't really spam the ML any more than a single mail in any sane MUA.
Lynne May 23, 2020, 6:10 p.m. UTC | #3
May 15, 2020, 19:36 by dev@lynne.ee:

> Just posting this as a single email not to spam the ML too much.
> Going to push this sometime over the weekend alongside the other 3 patches to finish
> all work on Vulkan before the next release.
> Satisfied with the code, though the last patch is slightly ugly.
> Performance improvements out of async code: 78% on a toy GPU with 1 queue for an upload+scale
> Probably closer to 200% on a discrete GPU with 4 transfer/compute queues.
>

Pushed.
diff mbox

Patch

diff --git a/libavfilter/vf_avgblur_vulkan.c b/libavfilter/vf_avgblur_vulkan.c
index 105d753f73..12d57e0875 100644
--- a/libavfilter/vf_avgblur_vulkan.c
+++ b/libavfilter/vf_avgblur_vulkan.c
@@ -97,6 +97,10 @@  static av_cold int init_filter(AVFilterContext *ctx, AVFrame *in)
     if (!sampler)
         return AVERROR_EXTERNAL;
 
+    s->vkctx.queue_family_idx = s->vkctx.hwctx->queue_family_comp_index;
+    s->vkctx.queue_count = GET_QUEUE_COUNT(s->vkctx.hwctx, 0, 1, 0);
+    s->vkctx.cur_queue_idx = rand() % s->vkctx.queue_count;
+
     { /* Create shader for the horizontal pass */
         desc_i[0].updater = s->input_images;
         desc_i[1].updater = s->tmp_images;
@@ -184,8 +188,7 @@  static av_cold int init_filter(AVFilterContext *ctx, AVFrame *in)
     }
 
     /* Execution context */
-    RET(ff_vk_create_exec_ctx(ctx, &s->exec,
-                              s->vkctx.hwctx->queue_family_comp_index));
+    RET(ff_vk_create_exec_ctx(ctx, &s->exec));
 
     s->initialized = 1;
 
@@ -198,22 +201,30 @@  fail:
 static int process_frames(AVFilterContext *avctx, AVFrame *out_f, AVFrame *tmp_f, AVFrame *in_f)
 {
     int err;
+    VkCommandBuffer cmd_buf;
     AvgBlurVulkanContext *s = avctx->priv;
     AVVkFrame *in = (AVVkFrame *)in_f->data[0];
     AVVkFrame *tmp = (AVVkFrame *)tmp_f->data[0];
     AVVkFrame *out = (AVVkFrame *)out_f->data[0];
     int planes = av_pix_fmt_count_planes(s->vkctx.output_format);
 
+    /* Update descriptors and init the exec context */
+    ff_vk_start_exec_recording(avctx, s->exec);
+    cmd_buf = ff_vk_get_exec_buf(avctx, s->exec);
+
     for (int i = 0; i < planes; i++) {
-        RET(ff_vk_create_imageview(avctx, &s->input_images[i].imageView, in->img[i],
+        RET(ff_vk_create_imageview(avctx, s->exec, &s->input_images[i].imageView,
+                                   in->img[i],
                                    av_vkfmt_from_pixfmt(s->vkctx.input_format)[i],
                                    ff_comp_identity_map));
 
-        RET(ff_vk_create_imageview(avctx, &s->tmp_images[i].imageView, tmp->img[i],
+        RET(ff_vk_create_imageview(avctx, s->exec, &s->tmp_images[i].imageView,
+                                   tmp->img[i],
                                    av_vkfmt_from_pixfmt(s->vkctx.output_format)[i],
                                    ff_comp_identity_map));
 
-        RET(ff_vk_create_imageview(avctx, &s->output_images[i].imageView, out->img[i],
+        RET(ff_vk_create_imageview(avctx, s->exec, &s->output_images[i].imageView,
+                                   out->img[i],
                                    av_vkfmt_from_pixfmt(s->vkctx.output_format)[i],
                                    ff_comp_identity_map));
 
@@ -225,8 +236,6 @@  static int process_frames(AVFilterContext *avctx, AVFrame *out_f, AVFrame *tmp_f
     ff_vk_update_descriptor_set(avctx, s->pl_hor, 0);
     ff_vk_update_descriptor_set(avctx, s->pl_ver, 0);
 
-    ff_vk_start_exec_recording(avctx, s->exec);
-
     for (int i = 0; i < planes; i++) {
         VkImageMemoryBarrier bar[] = {
             {
@@ -270,7 +279,7 @@  static int process_frames(AVFilterContext *avctx, AVFrame *out_f, AVFrame *tmp_f
             },
         };
 
-        vkCmdPipelineBarrier(s->exec->buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+        vkCmdPipelineBarrier(cmd_buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                              VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0,
                              0, NULL, 0, NULL, FF_ARRAY_ELEMS(bar), bar);
 
@@ -286,12 +295,12 @@  static int process_frames(AVFilterContext *avctx, AVFrame *out_f, AVFrame *tmp_f
 
     ff_vk_bind_pipeline_exec(avctx, s->exec, s->pl_hor);
 
-    vkCmdDispatch(s->exec->buf, FFALIGN(s->vkctx.output_width, CGS)/CGS,
+    vkCmdDispatch(cmd_buf, FFALIGN(s->vkctx.output_width, CGS)/CGS,
                   s->vkctx.output_height, 1);
 
     ff_vk_bind_pipeline_exec(avctx, s->exec, s->pl_ver);
 
-    vkCmdDispatch(s->exec->buf, s->vkctx.output_width,
+    vkCmdDispatch(cmd_buf, s->vkctx.output_width,
                   FFALIGN(s->vkctx.output_height, CGS)/CGS, 1);
 
     ff_vk_add_exec_dep(avctx, s->exec, in_f, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT);
@@ -301,14 +310,10 @@  static int process_frames(AVFilterContext *avctx, AVFrame *out_f, AVFrame *tmp_f
     if (err)
         return err;
 
-fail:
-
-    for (int i = 0; i < planes; i++) {
-        ff_vk_destroy_imageview(avctx, &s->input_images[i].imageView);
-        ff_vk_destroy_imageview(avctx, &s->tmp_images[i].imageView);
-        ff_vk_destroy_imageview(avctx, &s->output_images[i].imageView);
-    }
+    return err;
 
+fail:
+    ff_vk_discard_exec_deps(avctx, s->exec);
     return err;
 }
 
diff --git a/libavfilter/vf_chromaber_vulkan.c b/libavfilter/vf_chromaber_vulkan.c
index 673b3a7a68..1bee5e10f8 100644
--- a/libavfilter/vf_chromaber_vulkan.c
+++ b/libavfilter/vf_chromaber_vulkan.c
@@ -73,6 +73,10 @@  static av_cold int init_filter(AVFilterContext *ctx, AVFrame *in)
     if (!sampler)
         return AVERROR_EXTERNAL;
 
+    s->vkctx.queue_family_idx = s->vkctx.hwctx->queue_family_comp_index;
+    s->vkctx.queue_count = GET_QUEUE_COUNT(s->vkctx.hwctx, 0, 1, 0);
+    s->vkctx.cur_queue_idx = rand() % s->vkctx.queue_count;
+
     s->pl = ff_vk_create_pipeline(ctx);
     if (!s->pl)
         return AVERROR(ENOMEM);
@@ -154,8 +158,7 @@  static av_cold int init_filter(AVFilterContext *ctx, AVFrame *in)
     RET(ff_vk_init_compute_pipeline(ctx, s->pl));
 
     /* Execution context */
-    RET(ff_vk_create_exec_ctx(ctx, &s->exec,
-                              s->vkctx.hwctx->queue_family_comp_index));
+    RET(ff_vk_create_exec_ctx(ctx, &s->exec));
 
     s->initialized = 1;
 
@@ -168,17 +171,24 @@  fail:
 static int process_frames(AVFilterContext *avctx, AVFrame *out_f, AVFrame *in_f)
 {
     int err = 0;
+    VkCommandBuffer cmd_buf;
     ChromaticAberrationVulkanContext *s = avctx->priv;
     AVVkFrame *in = (AVVkFrame *)in_f->data[0];
     AVVkFrame *out = (AVVkFrame *)out_f->data[0];
     int planes = av_pix_fmt_count_planes(s->vkctx.output_format);
 
+    /* Update descriptors and init the exec context */
+    ff_vk_start_exec_recording(avctx, s->exec);
+    cmd_buf = ff_vk_get_exec_buf(avctx, s->exec);
+
     for (int i = 0; i < planes; i++) {
-        RET(ff_vk_create_imageview(avctx, &s->input_images[i].imageView, in->img[i],
+        RET(ff_vk_create_imageview(avctx, s->exec, &s->input_images[i].imageView,
+                                   in->img[i],
                                    av_vkfmt_from_pixfmt(s->vkctx.input_format)[i],
                                    ff_comp_identity_map));
 
-        RET(ff_vk_create_imageview(avctx, &s->output_images[i].imageView, out->img[i],
+        RET(ff_vk_create_imageview(avctx, s->exec, &s->output_images[i].imageView,
+                                   out->img[i],
                                    av_vkfmt_from_pixfmt(s->vkctx.output_format)[i],
                                    ff_comp_identity_map));
 
@@ -188,8 +198,6 @@  static int process_frames(AVFilterContext *avctx, AVFrame *out_f, AVFrame *in_f)
 
     ff_vk_update_descriptor_set(avctx, s->pl, 0);
 
-    ff_vk_start_exec_recording(avctx, s->exec);
-
     for (int i = 0; i < planes; i++) {
         VkImageMemoryBarrier bar[2] = {
             {
@@ -220,7 +228,7 @@  static int process_frames(AVFilterContext *avctx, AVFrame *out_f, AVFrame *in_f)
             },
         };
 
-        vkCmdPipelineBarrier(s->exec->buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+        vkCmdPipelineBarrier(cmd_buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                              VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0,
                              0, NULL, 0, NULL, FF_ARRAY_ELEMS(bar), bar);
 
@@ -236,7 +244,7 @@  static int process_frames(AVFilterContext *avctx, AVFrame *out_f, AVFrame *in_f)
     ff_vk_update_push_exec(avctx, s->exec, VK_SHADER_STAGE_COMPUTE_BIT,
                            0, sizeof(s->opts), &s->opts);
 
-    vkCmdDispatch(s->exec->buf,
+    vkCmdDispatch(cmd_buf,
                   FFALIGN(s->vkctx.output_width,  CGROUPS[0])/CGROUPS[0],
                   FFALIGN(s->vkctx.output_height, CGROUPS[1])/CGROUPS[1], 1);
 
@@ -247,12 +255,10 @@  static int process_frames(AVFilterContext *avctx, AVFrame *out_f, AVFrame *in_f)
     if (err)
         return err;
 
-    for (int i = 0; i < planes; i++) {
-        ff_vk_destroy_imageview(avctx, &s->input_images[i].imageView);
-        ff_vk_destroy_imageview(avctx, &s->output_images[i].imageView);
-    }
+    return err;
 
 fail:
+    ff_vk_discard_exec_deps(avctx, s->exec);
     return err;
 }
 
diff --git a/libavfilter/vf_overlay_vulkan.c b/libavfilter/vf_overlay_vulkan.c
index 83cfae40e2..60a7356456 100644
--- a/libavfilter/vf_overlay_vulkan.c
+++ b/libavfilter/vf_overlay_vulkan.c
@@ -87,6 +87,10 @@  static av_cold int init_filter(AVFilterContext *ctx)
     if (!s->pl)
         return AVERROR(ENOMEM);
 
+    s->vkctx.queue_family_idx = s->vkctx.hwctx->queue_family_comp_index;
+    s->vkctx.queue_count = GET_QUEUE_COUNT(s->vkctx.hwctx, 0, 1, 0);
+    s->vkctx.cur_queue_idx = rand() % s->vkctx.queue_count;
+
     { /* Create the shader */
         const int planes = av_pix_fmt_count_planes(s->vkctx.output_format);
         const int ialpha = av_pix_fmt_desc_get(s->vkctx.input_format)->flags & AV_PIX_FMT_FLAG_ALPHA;
@@ -211,8 +215,7 @@  static av_cold int init_filter(AVFilterContext *ctx)
     }
 
     /* Execution context */
-    RET(ff_vk_create_exec_ctx(ctx, &s->exec,
-                              s->vkctx.hwctx->queue_family_comp_index));
+    RET(ff_vk_create_exec_ctx(ctx, &s->exec));
 
     s->initialized = 1;
 
@@ -226,6 +229,7 @@  static int process_frames(AVFilterContext *avctx, AVFrame *out_f,
                           AVFrame *main_f, AVFrame *overlay_f)
 {
     int err;
+    VkCommandBuffer cmd_buf;
     OverlayVulkanContext *s = avctx->priv;
     int planes = av_pix_fmt_count_planes(s->vkctx.output_format);
 
@@ -236,16 +240,23 @@  static int process_frames(AVFilterContext *avctx, AVFrame *out_f,
     AVHWFramesContext *main_fc = (AVHWFramesContext*)main_f->hw_frames_ctx->data;
     AVHWFramesContext *overlay_fc = (AVHWFramesContext*)overlay_f->hw_frames_ctx->data;
 
+    /* Update descriptors and init the exec context */
+    ff_vk_start_exec_recording(avctx, s->exec);
+    cmd_buf = ff_vk_get_exec_buf(avctx, s->exec);
+
     for (int i = 0; i < planes; i++) {
-        RET(ff_vk_create_imageview(avctx, &s->main_images[i].imageView, main->img[i],
+        RET(ff_vk_create_imageview(avctx, s->exec, &s->main_images[i].imageView,
+                                   main->img[i],
                                    av_vkfmt_from_pixfmt(main_fc->sw_format)[i],
                                    ff_comp_identity_map));
 
-        RET(ff_vk_create_imageview(avctx, &s->overlay_images[i].imageView, overlay->img[i],
+        RET(ff_vk_create_imageview(avctx, s->exec, &s->overlay_images[i].imageView,
+                                   overlay->img[i],
                                    av_vkfmt_from_pixfmt(overlay_fc->sw_format)[i],
                                    ff_comp_identity_map));
 
-        RET(ff_vk_create_imageview(avctx, &s->output_images[i].imageView, out->img[i],
+        RET(ff_vk_create_imageview(avctx, s->exec, &s->output_images[i].imageView,
+                                   out->img[i],
                                    av_vkfmt_from_pixfmt(s->vkctx.output_format)[i],
                                    ff_comp_identity_map));
 
@@ -256,8 +267,6 @@  static int process_frames(AVFilterContext *avctx, AVFrame *out_f,
 
     ff_vk_update_descriptor_set(avctx, s->pl, 0);
 
-    ff_vk_start_exec_recording(avctx, s->exec);
-
     for (int i = 0; i < planes; i++) {
         VkImageMemoryBarrier bar[3] = {
             {
@@ -301,7 +310,7 @@  static int process_frames(AVFilterContext *avctx, AVFrame *out_f,
             },
         };
 
-        vkCmdPipelineBarrier(s->exec->buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+        vkCmdPipelineBarrier(cmd_buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                              VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0,
                              0, NULL, 0, NULL, FF_ARRAY_ELEMS(bar), bar);
 
@@ -317,7 +326,7 @@  static int process_frames(AVFilterContext *avctx, AVFrame *out_f,
 
     ff_vk_bind_pipeline_exec(avctx, s->exec, s->pl);
 
-    vkCmdDispatch(s->exec->buf,
+    vkCmdDispatch(cmd_buf,
                   FFALIGN(s->vkctx.output_width,  CGROUPS[0])/CGROUPS[0],
                   FFALIGN(s->vkctx.output_height, CGROUPS[1])/CGROUPS[1], 1);
 
@@ -329,14 +338,10 @@  static int process_frames(AVFilterContext *avctx, AVFrame *out_f,
     if (err)
         return err;
 
-fail:
-
-    for (int i = 0; i < planes; i++) {
-        ff_vk_destroy_imageview(avctx, &s->main_images[i].imageView);
-        ff_vk_destroy_imageview(avctx, &s->overlay_images[i].imageView);
-        ff_vk_destroy_imageview(avctx, &s->output_images[i].imageView);
-    }
+    return err;
 
+fail:
+    ff_vk_discard_exec_deps(avctx, s->exec);
     return err;
 }
 
diff --git a/libavfilter/vf_scale_vulkan.c b/libavfilter/vf_scale_vulkan.c
index 328e6bcce5..9b2e5b92f6 100644
--- a/libavfilter/vf_scale_vulkan.c
+++ b/libavfilter/vf_scale_vulkan.c
@@ -115,6 +115,10 @@  static av_cold int init_filter(AVFilterContext *ctx, AVFrame *in)
     int crop_w = in->width - (in->crop_left + in->crop_right);
     int crop_h = in->height - (in->crop_top + in->crop_bottom);
 
+    s->vkctx.queue_family_idx = s->vkctx.hwctx->queue_family_comp_index;
+    s->vkctx.queue_count = GET_QUEUE_COUNT(s->vkctx.hwctx, 0, 1, 0);
+    s->vkctx.cur_queue_idx = rand() % s->vkctx.queue_count;
+
     switch (s->scaler) {
     case F_NEAREST:
         sampler_mode = VK_FILTER_NEAREST;
@@ -276,8 +280,7 @@  static av_cold int init_filter(AVFilterContext *ctx, AVFrame *in)
     }
 
     /* Execution context */
-    RET(ff_vk_create_exec_ctx(ctx, &s->exec,
-                              s->vkctx.hwctx->queue_family_comp_index));
+    RET(ff_vk_create_exec_ctx(ctx, &s->exec));
 
     s->initialized = 1;
 
@@ -290,14 +293,20 @@  fail:
 static int process_frames(AVFilterContext *avctx, AVFrame *out_f, AVFrame *in_f)
 {
     int err = 0;
+    VkCommandBuffer cmd_buf;
     ScaleVulkanContext *s = avctx->priv;
     AVVkFrame *in = (AVVkFrame *)in_f->data[0];
     AVVkFrame *out = (AVVkFrame *)out_f->data[0];
     VkImageMemoryBarrier barriers[AV_NUM_DATA_POINTERS*2];
     int barrier_count = 0;
 
+    /* Update descriptors and init the exec context */
+    ff_vk_start_exec_recording(avctx, s->exec);
+    cmd_buf = ff_vk_get_exec_buf(avctx, s->exec);
+
     for (int i = 0; i < av_pix_fmt_count_planes(s->vkctx.input_format); i++) {
-        RET(ff_vk_create_imageview(avctx, &s->input_images[i].imageView, in->img[i],
+        RET(ff_vk_create_imageview(avctx, s->exec, &s->input_images[i].imageView,
+                                   in->img[i],
                                    av_vkfmt_from_pixfmt(s->vkctx.input_format)[i],
                                    ff_comp_identity_map));
 
@@ -305,7 +314,8 @@  static int process_frames(AVFilterContext *avctx, AVFrame *out_f, AVFrame *in_f)
     }
 
     for (int i = 0; i < av_pix_fmt_count_planes(s->vkctx.output_format); i++) {
-        RET(ff_vk_create_imageview(avctx, &s->output_images[i].imageView, out->img[i],
+        RET(ff_vk_create_imageview(avctx, s->exec, &s->output_images[i].imageView,
+                                   out->img[i],
                                    av_vkfmt_from_pixfmt(s->vkctx.output_format)[i],
                                    ff_comp_identity_map));
 
@@ -314,8 +324,6 @@  static int process_frames(AVFilterContext *avctx, AVFrame *out_f, AVFrame *in_f)
 
     ff_vk_update_descriptor_set(avctx, s->pl, 0);
 
-    ff_vk_start_exec_recording(avctx, s->exec);
-
     for (int i = 0; i < av_pix_fmt_count_planes(s->vkctx.input_format); i++) {
         VkImageMemoryBarrier bar = {
             .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
@@ -358,13 +366,13 @@  static int process_frames(AVFilterContext *avctx, AVFrame *out_f, AVFrame *in_f)
         out->access[i] = bar.dstAccessMask;
     }
 
-    vkCmdPipelineBarrier(s->exec->buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+    vkCmdPipelineBarrier(cmd_buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                          VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0,
                          0, NULL, 0, NULL, barrier_count, barriers);
 
     ff_vk_bind_pipeline_exec(avctx, s->exec, s->pl);
 
-    vkCmdDispatch(s->exec->buf,
+    vkCmdDispatch(cmd_buf,
                   FFALIGN(s->vkctx.output_width,  CGROUPS[0])/CGROUPS[0],
                   FFALIGN(s->vkctx.output_height, CGROUPS[1])/CGROUPS[1], 1);
 
@@ -375,12 +383,10 @@  static int process_frames(AVFilterContext *avctx, AVFrame *out_f, AVFrame *in_f)
     if (err)
         return err;
 
-    for (int i = 0; i < av_pix_fmt_count_planes(s->vkctx.input_format); i++)
-        ff_vk_destroy_imageview(avctx, &s->input_images[i].imageView);
-    for (int i = 0; i < av_pix_fmt_count_planes(s->vkctx.output_format); i++)
-        ff_vk_destroy_imageview(avctx, &s->output_images[i].imageView);
+    return err;
 
 fail:
+    ff_vk_discard_exec_deps(avctx, s->exec);
     return err;
 }
 
diff --git a/libavfilter/vulkan.c b/libavfilter/vulkan.c
index ccf71cb7cd..301ee4354f 100644
--- a/libavfilter/vulkan.c
+++ b/libavfilter/vulkan.c
@@ -311,72 +311,116 @@  int ff_vk_add_push_constant(AVFilterContext *avctx, VulkanPipeline *pl,
 }
 
 FN_CREATING(VulkanFilterContext, FFVkExecContext, exec_ctx, exec_ctx, exec_ctx_num)
-int ff_vk_create_exec_ctx(AVFilterContext *avctx, FFVkExecContext **ctx, int queue)
+int ff_vk_create_exec_ctx(AVFilterContext *avctx, FFVkExecContext **ctx)
 {
     VkResult ret;
     FFVkExecContext *e;
     VulkanFilterContext *s = avctx->priv;
 
+    int queue_family = s->queue_family_idx;
+    int nb_queues = s->queue_count;
+
     VkCommandPoolCreateInfo cqueue_create = {
         .sType              = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
         .flags              = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
-        .queueFamilyIndex   = queue,
+        .queueFamilyIndex   = queue_family,
     };
     VkCommandBufferAllocateInfo cbuf_create = {
         .sType              = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
         .level              = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
-        .commandBufferCount = 1,
+        .commandBufferCount = nb_queues,
     };
-    VkFenceCreateInfo fence_spawn = { VK_STRUCTURE_TYPE_FENCE_CREATE_INFO };
 
     e = create_exec_ctx(s);
     if (!e)
         return AVERROR(ENOMEM);
 
+    e->queues = av_mallocz(nb_queues * sizeof(*e->queues));
+    if (!e->queues)
+        return AVERROR(ENOMEM);
+
+    e->bufs = av_mallocz(nb_queues * sizeof(*e->bufs));
+    if (!e->bufs)
+        return AVERROR(ENOMEM);
+
+    /* Create command pool */
     ret = vkCreateCommandPool(s->hwctx->act_dev, &cqueue_create,
                               s->hwctx->alloc, &e->pool);
     if (ret != VK_SUCCESS) {
         av_log(avctx, AV_LOG_ERROR, "Command pool creation failure: %s\n",
                ff_vk_ret2str(ret));
-        return 1;
+        return AVERROR_EXTERNAL;
     }
 
     cbuf_create.commandPool = e->pool;
 
-    ret = vkAllocateCommandBuffers(s->hwctx->act_dev, &cbuf_create, &e->buf);
+    /* Allocate command buffer */
+    ret = vkAllocateCommandBuffers(s->hwctx->act_dev, &cbuf_create, e->bufs);
     if (ret != VK_SUCCESS) {
         av_log(avctx, AV_LOG_ERROR, "Command buffer alloc failure: %s\n",
                ff_vk_ret2str(ret));
-        return 1;
+        return AVERROR_EXTERNAL;
     }
 
-    ret = vkCreateFence(s->hwctx->act_dev, &fence_spawn,
-                        s->hwctx->alloc, &e->fence);
-    if (ret != VK_SUCCESS) {
-        av_log(avctx, AV_LOG_ERROR, "Failed to create frame fence: %s\n",
-               ff_vk_ret2str(ret));
-        return 1;
+    for (int i = 0; i < nb_queues; i++) {
+        FFVkQueueCtx *q = &e->queues[i];
+        vkGetDeviceQueue(s->hwctx->act_dev, queue_family, i, &q->queue);
     }
 
-    vkGetDeviceQueue(s->hwctx->act_dev, queue, 0, &e->queue);
-
     *ctx = e;
 
     return 0;
 }
 
+void ff_vk_discard_exec_deps(AVFilterContext *avctx, FFVkExecContext *e)
+{
+    VulkanFilterContext *s = avctx->priv;
+    FFVkQueueCtx *q = &e->queues[s->cur_queue_idx];
+
+    for (int j = 0; j < q->nb_buf_deps; j++)
+        av_buffer_unref(&q->buf_deps[j]);
+    q->nb_buf_deps = 0;
+
+    for (int j = 0; j < q->nb_frame_deps; j++)
+        av_frame_free(&q->frame_deps[j]);
+    q->nb_frame_deps = 0;
+
+    e->sem_wait_cnt = 0;
+    e->sem_sig_cnt = 0;
+}
+
 int ff_vk_start_exec_recording(AVFilterContext *avctx, FFVkExecContext *e)
 {
     VkResult ret;
+    VulkanFilterContext *s = avctx->priv;
+    FFVkQueueCtx *q = &e->queues[s->cur_queue_idx];
+
     VkCommandBufferBeginInfo cmd_start = {
         .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
         .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
     };
 
-    e->sem_wait_cnt = 0;
-    e->sem_sig_cnt = 0;
+    /* Create the fence and don't wait for it initially */
+    if (!q->fence) {
+        VkFenceCreateInfo fence_spawn = {
+            .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
+        };
+        ret = vkCreateFence(s->hwctx->act_dev, &fence_spawn, s->hwctx->alloc,
+                            &q->fence);
+        if (ret != VK_SUCCESS) {
+            av_log(avctx, AV_LOG_ERROR, "Failed to queue frame fence: %s\n",
+                   ff_vk_ret2str(ret));
+            return AVERROR_EXTERNAL;
+        }
+    } else {
+        vkWaitForFences(s->hwctx->act_dev, 1, &q->fence, VK_TRUE, UINT64_MAX);
+        vkResetFences(s->hwctx->act_dev, 1, &q->fence);
+    }
 
-    ret = vkBeginCommandBuffer(e->buf, &cmd_start);
+    /* Discard queue dependencies */
+    ff_vk_discard_exec_deps(avctx, e);
+
+    ret = vkBeginCommandBuffer(e->bufs[s->cur_queue_idx], &cmd_start);
     if (ret != VK_SUCCESS) {
         av_log(avctx, AV_LOG_ERROR, "Failed to start command recoding: %s\n",
                ff_vk_ret2str(ret));
@@ -386,28 +430,43 @@  int ff_vk_start_exec_recording(AVFilterContext *avctx, FFVkExecContext *e)
     return 0;
 }
 
+VkCommandBuffer ff_vk_get_exec_buf(AVFilterContext *avctx, FFVkExecContext *e)
+{
+    VulkanFilterContext *s = avctx->priv;
+    return e->bufs[s->cur_queue_idx];
+}
+
 int ff_vk_add_exec_dep(AVFilterContext *avctx, FFVkExecContext *e,
                        AVFrame *frame, VkPipelineStageFlagBits in_wait_dst_flag)
 {
+    AVFrame **dst;
+    VulkanFilterContext *s = avctx->priv;
     AVVkFrame *f = (AVVkFrame *)frame->data[0];
+    FFVkQueueCtx *q = &e->queues[s->cur_queue_idx];
     AVHWFramesContext *fc = (AVHWFramesContext *)frame->hw_frames_ctx->data;
     int planes = av_pix_fmt_count_planes(fc->sw_format);
 
     for (int i = 0; i < planes; i++) {
         e->sem_wait = av_fast_realloc(e->sem_wait, &e->sem_wait_alloc,
                                       (e->sem_wait_cnt + 1)*sizeof(*e->sem_wait));
-        if (!e->sem_wait)
+        if (!e->sem_wait) {
+            ff_vk_discard_exec_deps(avctx, e);
             return AVERROR(ENOMEM);
+        }
 
         e->sem_wait_dst = av_fast_realloc(e->sem_wait_dst, &e->sem_wait_dst_alloc,
                                           (e->sem_wait_cnt + 1)*sizeof(*e->sem_wait_dst));
-        if (!e->sem_wait_dst)
+        if (!e->sem_wait_dst) {
+            ff_vk_discard_exec_deps(avctx, e);
             return AVERROR(ENOMEM);
+        }
 
         e->sem_sig = av_fast_realloc(e->sem_sig, &e->sem_sig_alloc,
                                      (e->sem_sig_cnt + 1)*sizeof(*e->sem_sig));
-        if (!e->sem_sig)
+        if (!e->sem_sig) {
+            ff_vk_discard_exec_deps(avctx, e);
             return AVERROR(ENOMEM);
+        }
 
         e->sem_wait[e->sem_wait_cnt] = f->sem[i];
         e->sem_wait_dst[e->sem_wait_cnt] = in_wait_dst_flag;
@@ -417,6 +476,21 @@  int ff_vk_add_exec_dep(AVFilterContext *avctx, FFVkExecContext *e,
         e->sem_sig_cnt++;
     }
 
+    dst = av_fast_realloc(q->frame_deps, &q->frame_deps_alloc_size,
+                          (q->nb_frame_deps + 1) * sizeof(*dst));
+    if (!dst) {
+        ff_vk_discard_exec_deps(avctx, e);
+        return AVERROR(ENOMEM);
+    }
+
+    q->frame_deps = dst;
+    q->frame_deps[q->nb_frame_deps] = av_frame_clone(frame);
+    if (!q->frame_deps[q->nb_frame_deps]) {
+        ff_vk_discard_exec_deps(avctx, e);
+        return AVERROR(ENOMEM);
+    }
+    q->nb_frame_deps++;
+
     return 0;
 }
 
@@ -424,11 +498,12 @@  int ff_vk_submit_exec_queue(AVFilterContext *avctx, FFVkExecContext *e)
 {
     VkResult ret;
     VulkanFilterContext *s = avctx->priv;
+    FFVkQueueCtx *q = &e->queues[s->cur_queue_idx];
 
     VkSubmitInfo s_info = {
         .sType                = VK_STRUCTURE_TYPE_SUBMIT_INFO,
         .commandBufferCount   = 1,
-        .pCommandBuffers      = &e->buf,
+        .pCommandBuffers      = &e->bufs[s->cur_queue_idx],
 
         .pWaitSemaphores      = e->sem_wait,
         .pWaitDstStageMask    = e->sem_wait_dst,
@@ -438,21 +513,57 @@  int ff_vk_submit_exec_queue(AVFilterContext *avctx, FFVkExecContext *e)
         .signalSemaphoreCount = e->sem_sig_cnt,
     };
 
-    vkEndCommandBuffer(e->buf);
+    ret = vkEndCommandBuffer(e->bufs[s->cur_queue_idx]);
+    if (ret != VK_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Unable to finish command buffer: %s\n",
+               ff_vk_ret2str(ret));
+        return AVERROR_EXTERNAL;
+    }
 
-    ret = vkQueueSubmit(e->queue, 1, &s_info, e->fence);
+    ret = vkQueueSubmit(q->queue, 1, &s_info, q->fence);
     if (ret != VK_SUCCESS) {
         av_log(avctx, AV_LOG_ERROR, "Unable to submit command buffer: %s\n",
                ff_vk_ret2str(ret));
         return AVERROR_EXTERNAL;
     }
 
-    vkWaitForFences(s->hwctx->act_dev, 1, &e->fence, VK_TRUE, UINT64_MAX);
-    vkResetFences(s->hwctx->act_dev, 1, &e->fence);
+    /* Rotate queues */
+    s->cur_queue_idx = (s->cur_queue_idx + 1) % s->queue_count;
 
     return 0;
 }
 
+int ff_vk_add_dep_exec_ctx(AVFilterContext *avctx, FFVkExecContext *e,
+                           AVBufferRef **deps, int nb_deps)
+{
+    AVBufferRef **dst;
+    VulkanFilterContext *s = avctx->priv;
+    FFVkQueueCtx *q = &e->queues[s->cur_queue_idx];
+
+    if (!deps || !nb_deps)
+        return 0;
+
+    dst = av_fast_realloc(q->buf_deps, &q->buf_deps_alloc_size,
+                          (q->nb_buf_deps + nb_deps) * sizeof(*dst));
+    if (!dst)
+        goto err;
+
+    q->buf_deps = dst;
+
+    for (int i = 0; i < nb_deps; i++) {
+        q->buf_deps[q->nb_buf_deps] = deps[i];
+        if (!q->buf_deps[q->nb_buf_deps])
+            goto err;
+        q->nb_buf_deps++;
+    }
+
+    return 0;
+
+err:
+    ff_vk_discard_exec_deps(avctx, e);
+    return AVERROR(ENOMEM);
+}
+
 int ff_vk_filter_query_formats(AVFilterContext *avctx)
 {
     static const enum AVPixelFormat pixel_formats[] = {
@@ -685,9 +796,24 @@  const char *ff_vk_shader_rep_fmt(enum AVPixelFormat pixfmt)
     return high ? "rgba16f" : "rgba8";
 }
 
-int ff_vk_create_imageview(AVFilterContext *avctx, VkImageView *v, VkImage img,
-                           VkFormat fmt, const VkComponentMapping map)
+typedef struct ImageViewCtx {
+    VkImageView view;
+} ImageViewCtx;
+
+static void destroy_imageview(void *opaque, uint8_t *data)
+{
+    VulkanFilterContext *s = opaque;
+    ImageViewCtx *iv = (ImageViewCtx *)data;
+    vkDestroyImageView(s->hwctx->act_dev, iv->view, s->hwctx->alloc);
+    av_free(iv);
+}
+
+int ff_vk_create_imageview(AVFilterContext *avctx, FFVkExecContext *e,
+                           VkImageView *v, VkImage img, VkFormat fmt,
+                           const VkComponentMapping map)
 {
+    int err;
+    AVBufferRef *buf;
     VulkanFilterContext *s = avctx->priv;
     VkImageViewCreateInfo imgview_spawn = {
         .sType      = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
@@ -705,24 +831,37 @@  int ff_vk_create_imageview(AVFilterContext *avctx, VkImageView *v, VkImage img,
         },
     };
 
+    VkResult ret;
+    /* Heap-allocated so an AVBufferRef can own the view and destroy it later */
+    ImageViewCtx *iv = av_mallocz(sizeof(*iv));
+    if (!iv)
+        return AVERROR(ENOMEM);
+
-    VkResult ret = vkCreateImageView(s->hwctx->act_dev, &imgview_spawn,
-                                     s->hwctx->alloc, v);
+    ret = vkCreateImageView(s->hwctx->act_dev, &imgview_spawn,
+                            s->hwctx->alloc, &iv->view);
     if (ret != VK_SUCCESS) {
-        av_log(s, AV_LOG_ERROR, "Failed to create imageview: %s\n",
+        av_log(avctx, AV_LOG_ERROR, "Failed to create imageview: %s\n",
                ff_vk_ret2str(ret));
+        av_free(iv); /* no view was created and no buffer owns iv yet */
         return AVERROR_EXTERNAL;
     }
 
-    return 0;
-}
+    buf = av_buffer_create((uint8_t *)iv, sizeof(*iv), destroy_imageview, s, 0);
+    if (!buf) {
+        destroy_imageview(s, (uint8_t *)iv);
+        return AVERROR(ENOMEM);
+    }
 
-void ff_vk_destroy_imageview(AVFilterContext *avctx, VkImageView *v)
-{
-    VulkanFilterContext *s = avctx->priv;
-    if (v && *v) {
-        vkDestroyImageView(s->hwctx->act_dev, *v, s->hwctx->alloc);
-        *v = NULL;
+    /* Add to queue dependencies */
+    err = ff_vk_add_dep_exec_ctx(avctx, e, &buf, 1);
+    if (err) {
+        av_buffer_unref(&buf);
+        return err;
     }
+
+    *v = iv->view;
+
+    return 0;
 }
 
 FN_CREATING(VulkanPipeline, SPIRVShader, shader, shaders, shaders_num)
@@ -870,11 +1004,11 @@  int ff_vk_add_descriptor_set(AVFilterContext *avctx, VulkanPipeline *pl,
         goto print;
 
     pl->desc_layout = av_realloc_array(pl->desc_layout, sizeof(*pl->desc_layout),
-                                       pl->descriptor_sets_num + 1);
+                                       pl->desc_layout_num + 1);
     if (!pl->desc_layout)
         return AVERROR(ENOMEM);
 
-    layout = &pl->desc_layout[pl->descriptor_sets_num];
+    layout = &pl->desc_layout[pl->desc_layout_num];
     memset(layout, 0, sizeof(*layout));
 
     { /* Create descriptor set layout descriptions */
@@ -946,11 +1080,11 @@  int ff_vk_add_descriptor_set(AVFilterContext *avctx, VulkanPipeline *pl,
 
         pl->desc_template_info = av_realloc_array(pl->desc_template_info,
                                                   sizeof(*pl->desc_template_info),
-                                                  pl->descriptor_sets_num + 1);
+                                                  pl->desc_layout_num + 1);
         if (!pl->desc_template_info)
             return AVERROR(ENOMEM);
 
-        dt = &pl->desc_template_info[pl->descriptor_sets_num];
+        dt = &pl->desc_template_info[pl->desc_layout_num];
         memset(dt, 0, sizeof(*dt));
 
         dt->sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO;
@@ -960,13 +1094,13 @@  int ff_vk_add_descriptor_set(AVFilterContext *avctx, VulkanPipeline *pl,
         dt->descriptorUpdateEntryCount = num;
     }
 
-    pl->descriptor_sets_num++;
+    pl->desc_layout_num++;
 
 print:
     /* Write shader info */
     for (int i = 0; i < num; i++) {
         const struct descriptor_props *prop = &descriptor_props[desc[i].type];
-        GLSLA("layout (set = %i, binding = %i", pl->descriptor_sets_num - 1, i);
+        GLSLA("layout (set = %i, binding = %i", pl->desc_layout_num - 1, i);
 
         if (desc[i].mem_layout)
             GLSLA(", %s", desc[i].mem_layout);
@@ -1004,15 +1138,22 @@  void ff_vk_update_descriptor_set(AVFilterContext *avctx, VulkanPipeline *pl,
     VulkanFilterContext *s = avctx->priv;
 
+    /* Index as queue * desc_layout_num + set; the modulo keeps the index in
+     * range for however many sets descriptor_sets_num says were created. */
+    const int idx = (s->cur_queue_idx * pl->desc_layout_num + set_id) %
+                    pl->descriptor_sets_num;
+
     vkUpdateDescriptorSetWithTemplate(s->hwctx->act_dev,
-                                      pl->desc_set[set_id],
-                                      pl->desc_template[set_id], s);
+                                      pl->desc_set[idx],
+                                      pl->desc_template[set_id],
+                                      s);
 }
 
 void ff_vk_update_push_exec(AVFilterContext *avctx, FFVkExecContext *e,
                             VkShaderStageFlagBits stage, int offset,
                             size_t size, void *src)
 {
-    vkCmdPushConstants(e->buf, e->bound_pl->pipeline_layout,
+    VulkanFilterContext *s = avctx->priv;
+    vkCmdPushConstants(e->bufs[s->cur_queue_idx], e->bound_pl->pipeline_layout,
                        stage, offset, size, src);
 }
 
@@ -1021,6 +1157,10 @@  int ff_vk_init_pipeline_layout(AVFilterContext *avctx, VulkanPipeline *pl)
     VkResult ret;
     VulkanFilterContext *s = avctx->priv;
 
+    int queues_count = 1;
+
+    pl->descriptor_sets_num = pl->desc_layout_num * queues_count;
+
     { /* Init descriptor set pool */
         VkDescriptorPoolCreateInfo pool_create_info = {
             .sType         = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
@@ -1063,7 +1203,7 @@  int ff_vk_init_pipeline_layout(AVFilterContext *avctx, VulkanPipeline *pl)
     { /* Finally create the pipeline layout */
         VkPipelineLayoutCreateInfo spawn_pipeline_layout = {
             .sType                  = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
-            .setLayoutCount         = pl->descriptor_sets_num,
+            .setLayoutCount         = pl->desc_layout_num,
             .pSetLayouts            = pl->desc_layout,
             .pushConstantRangeCount = pl->push_consts_num,
             .pPushConstantRanges    = pl->push_consts,
@@ -1089,7 +1229,7 @@  int ff_vk_init_pipeline_layout(AVFilterContext *avctx, VulkanPipeline *pl)
 
         /* Create update templates for the descriptor sets */
         for (int i = 0; i < pl->descriptor_sets_num; i++) {
-            desc_template_info = &pl->desc_template_info[i];
+            desc_template_info = &pl->desc_template_info[i % pl->desc_layout_num];
             desc_template_info->pipelineLayout = pl->pipeline_layout;
             ret = vkCreateDescriptorUpdateTemplate(s->hwctx->act_dev,
                                                    desc_template_info,
@@ -1153,27 +1293,53 @@  int ff_vk_init_compute_pipeline(AVFilterContext *avctx, VulkanPipeline *pl)
 void ff_vk_bind_pipeline_exec(AVFilterContext *avctx, FFVkExecContext *e,
                               VulkanPipeline *pl)
 {
-    vkCmdBindPipeline(e->buf, pl->bind_point, pl->pipeline);
+    VulkanFilterContext *s = avctx->priv;
+
+    vkCmdBindPipeline(e->bufs[s->cur_queue_idx], pl->bind_point, pl->pipeline);
 
-    vkCmdBindDescriptorSets(e->buf, pl->bind_point, pl->pipeline_layout, 0,
-                            pl->descriptor_sets_num, pl->desc_set, 0, 0);
+    vkCmdBindDescriptorSets(e->bufs[s->cur_queue_idx], pl->bind_point,
+                            pl->pipeline_layout, 0, pl->descriptor_sets_num,
+                            pl->desc_set, 0, 0);
 
     e->bound_pl = pl;
 }
 
 static void free_exec_ctx(VulkanFilterContext *s, FFVkExecContext *e)
 {
-    vkDestroyFence(s->hwctx->act_dev, e->fence, s->hwctx->alloc);
+    /* Make sure all queues have finished executing */
+    for (int i = 0; i < s->queue_count; i++) {
+        FFVkQueueCtx *q = &e->queues[i];
 
-    if (e->buf   != VK_NULL_HANDLE)
-        vkFreeCommandBuffers(s->hwctx->act_dev, e->pool, 1, &e->buf);
-    if (e->pool  != VK_NULL_HANDLE)
-        vkDestroyCommandPool(s->hwctx->act_dev, e->pool, s->hwctx->alloc);
+        if (q->fence) {
+            vkWaitForFences(s->hwctx->act_dev, 1, &q->fence, VK_TRUE, UINT64_MAX);
+            vkResetFences(s->hwctx->act_dev, 1, &q->fence);
+        }
+
+        /* Free the fence */
+        if (q->fence)
+            vkDestroyFence(s->hwctx->act_dev, q->fence, s->hwctx->alloc);
 
-    av_free(e->sem_wait);
-    av_free(e->sem_wait_dst);
-    av_free(e->sem_sig);
+        /* Free buffer dependencies */
+        for (int j = 0; j < q->nb_buf_deps; j++)
+            av_buffer_unref(&q->buf_deps[j]);
+        av_free(q->buf_deps);
 
+        /* Free frame dependencies */
+        for (int j = 0; j < q->nb_frame_deps; j++)
+            av_frame_free(&q->frame_deps[j]);
+        av_free(q->frame_deps);
+    }
+
+    if (e->bufs)
+        vkFreeCommandBuffers(s->hwctx->act_dev, e->pool, s->queue_count, e->bufs);
+    if (e->pool)
+        vkDestroyCommandPool(s->hwctx->act_dev, e->pool, s->hwctx->alloc);
+
+    av_freep(&e->bufs);
+    av_freep(&e->queues);
+    av_freep(&e->sem_sig);
+    av_freep(&e->sem_wait);
+    av_freep(&e->sem_wait_dst);
     av_free(e);
 }
 
@@ -1191,7 +1357,7 @@  static void free_pipeline(VulkanFilterContext *s, VulkanPipeline *pl)
     vkDestroyPipelineLayout(s->hwctx->act_dev, pl->pipeline_layout,
                             s->hwctx->alloc);
 
-    for (int i = 0; i < pl->descriptor_sets_num; i++) {
+    for (int i = 0; i < pl->desc_layout_num; i++) {
         if (pl->desc_template && pl->desc_template[i])
             vkDestroyDescriptorUpdateTemplate(s->hwctx->act_dev, pl->desc_template[i],
                                               s->hwctx->alloc);
@@ -1229,6 +1395,10 @@  void ff_vk_filter_uninit(AVFilterContext *avctx)
 
     glslang_uninit();
 
+    for (int i = 0; i < s->exec_ctx_num; i++)
+        free_exec_ctx(s, s->exec_ctx[i]);
+    av_freep(&s->exec_ctx);
+
     for (int i = 0; i < s->samplers_num; i++) {
         vkDestroySampler(s->hwctx->act_dev, *s->samplers[i], s->hwctx->alloc);
         av_free(s->samplers[i]);
@@ -1239,10 +1409,6 @@  void ff_vk_filter_uninit(AVFilterContext *avctx)
         free_pipeline(s, s->pipelines[i]);
     av_freep(&s->pipelines);
 
-    for (int i = 0; i < s->exec_ctx_num; i++)
-        free_exec_ctx(s, s->exec_ctx[i]);
-    av_freep(&s->exec_ctx);
-
     av_freep(&s->scratch);
     s->scratch_size = 0;
 
diff --git a/libavfilter/vulkan.h b/libavfilter/vulkan.h
index 30a64ce933..f9a4dc5839 100644
--- a/libavfilter/vulkan.h
+++ b/libavfilter/vulkan.h
@@ -49,6 +49,17 @@ 
             goto fail;                                                         \
     } while (0)
 
+/* Queue count of one family; empty ones fall back tx -> comp -> graphics */
+#define GET_QUEUE_COUNT(hwctx, graph, comp, tx) (                         \
+    (graph) ?  (hwctx)->nb_graphics_queues :                              \
+    (comp)  ? ((hwctx)->nb_comp_queues ?                                  \
+               (hwctx)->nb_comp_queues : (hwctx)->nb_graphics_queues) :   \
+    (tx)    ? ((hwctx)->nb_tx_queues ? (hwctx)->nb_tx_queues :            \
+               ((hwctx)->nb_comp_queues ?                                 \
+                (hwctx)->nb_comp_queues : (hwctx)->nb_graphics_queues)) : \
+    0                                                                     \
+)
+
 /* Useful for attaching immutable samplers to arrays */
 #define DUP_SAMPLER_ARRAY4(x) (VkSampler []){ x, x, x, x, }
 
@@ -98,6 +109,7 @@  typedef struct VulkanPipeline {
     VkDescriptorPool            desc_pool;
     VkDescriptorSet            *desc_set;
     VkDescriptorUpdateTemplate *desc_template;
+    int                         desc_layout_num;
     int                         descriptor_sets_num;
     int                         pool_size_desc_num;
 
@@ -106,11 +118,29 @@  typedef struct VulkanPipeline {
     VkDescriptorPoolSize *pool_size_desc;
 } VulkanPipeline;
 
+typedef struct FFVkQueueCtx {
+    VkFence fence;
+    VkQueue queue;
+
+    /* Buffer dependencies (AVBufferRefs unref'd when the exec ctx is freed) */
+    AVBufferRef **buf_deps;
+    int nb_buf_deps;
+    unsigned int buf_deps_alloc_size; /* av_fast_realloc() wants unsigned int * */
+
+    /* Frame dependencies (AVFrames freed when the exec ctx is freed) */
+    AVFrame **frame_deps;
+    int nb_frame_deps;
+    int frame_deps_alloc_size;
+} FFVkQueueCtx;
+
 typedef struct FFVkExecContext {
     VkCommandPool pool;
-    VkCommandBuffer buf;
-    VkQueue queue;
-    VkFence fence;
+    VkCommandBuffer *bufs;
+    FFVkQueueCtx *queues;
+
+    AVBufferRef ***deps;
+    int *nb_deps;
+    int *dep_alloc_size;
 
     VulkanPipeline *bound_pl;
 
@@ -134,6 +164,11 @@  typedef struct VulkanFilterContext {
     AVHWDeviceContext     *device;
     AVVulkanDeviceContext *hwctx;
 
+    /* State - mirrored with the exec ctx */
+    int cur_queue_idx;
+    int queue_family_idx;
+    int queue_count;
+
     /* Properties */
     int                 output_width;
     int                output_height;
@@ -192,15 +227,12 @@  VkSampler *ff_vk_init_sampler(AVFilterContext *avctx, int unnorm_coords,
 
 /**
  * Create an imageview.
+ * Guaranteed to remain alive until the queue submission has finished executing,
+ * and will be destroyed after that.
  */
-int ff_vk_create_imageview(AVFilterContext *avctx, VkImageView *v, VkImage img,
-                           VkFormat fmt, const VkComponentMapping map);
-
-/**
- * Destroy an imageview. Command buffer must have completed executing, which
- * ff_vk_submit_exec_queue() will ensure
- */
-void ff_vk_destroy_imageview(AVFilterContext *avctx, VkImageView *v);
+int ff_vk_create_imageview(AVFilterContext *avctx, FFVkExecContext *e,
+                           VkImageView *v, VkImage img, VkFormat fmt,
+                           const VkComponentMapping map);
 
 /**
  * Define a push constant for a given stage into a pipeline.
@@ -264,7 +296,7 @@  void ff_vk_update_descriptor_set(AVFilterContext *avctx, VulkanPipeline *pl,
  * Init an execution context for command recording and queue submission.
  * WIll be auto-freed on uninit.
  */
-int ff_vk_create_exec_ctx(AVFilterContext *avctx, FFVkExecContext **ctx, int queue);
+int ff_vk_create_exec_ctx(AVFilterContext *avctx, FFVkExecContext **ctx);
 
 /**
  * Begin recording to the command buffer. Previous execution must have been
@@ -288,7 +320,23 @@  void ff_vk_update_push_exec(AVFilterContext *avctx, FFVkExecContext *e,
                             size_t size, void *src);
 
 /**
- * Adds a frame as a queue dependency. This manages semaphore signalling.
+ * Gets the command buffer to use for this submission from the exec context.
+ */
+VkCommandBuffer ff_vk_get_exec_buf(AVFilterContext *avctx, FFVkExecContext *e);
+
+/**
+ * Adds a generic AVBufferRef as a queue dependency.
+ */
+int ff_vk_add_dep_exec_ctx(AVFilterContext *avctx, FFVkExecContext *e,
+                           AVBufferRef **deps, int nb_deps);
+
+/**
+ * Discards all queue dependencies
+ */
+void ff_vk_discard_exec_deps(AVFilterContext *avctx, FFVkExecContext *e);
+
+/**
+ * Adds a frame as a queue dependency. This also manages semaphore signalling.
  * Must be called before submission.
  */
 int ff_vk_add_exec_dep(AVFilterContext *avctx, FFVkExecContext *e,