diff mbox series

[FFmpeg-devel,1/3] nlmeans_vulkan: fix width/height for chroma plane weights calculation

Message ID: Ng9eVFH--3-9@lynne.ee
State: New
Headers show
Series [FFmpeg-devel,1/3] nlmeans_vulkan: fix width/height for chroma plane weights calculation | expand

Commit Message

Lynne Oct. 7, 2023, 3:03 p.m. UTC
Patch attached.

Comments

Lynne Oct. 11, 2023, 3:34 a.m. UTC | #1
Oct 7, 2023, 17:08 by dev@lynne.ee:

> Removes the clever subgroup parallel prefix computation,
> and instead just computes the prefix inline.
> Cuts down the number of dispatches by a huge amount.
>
> Provides a ~12x speedup (2.5fps to 30fps on a 7900XTX,
> 2.1fps to 24fps on an Ada).
>
> Patch attached.
>

Going to push the patchset a bit later today.
diff mbox series

Patch

From 927c74d7851aafc589760a3882bef7f72b19db1c Mon Sep 17 00:00:00 2001
From: Lynne <dev@lynne.ee>
Date: Sat, 16 Sep 2023 00:42:53 +0200
Subject: [PATCH 1/3] nlmeans_vulkan: fix width/height for chroma plane weights
 calculation

---
 libavfilter/vf_nlmeans_vulkan.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libavfilter/vf_nlmeans_vulkan.c b/libavfilter/vf_nlmeans_vulkan.c
index 99f4f867e7..5b623eb7a6 100644
--- a/libavfilter/vf_nlmeans_vulkan.c
+++ b/libavfilter/vf_nlmeans_vulkan.c
@@ -100,7 +100,7 @@  static void insert_horizontal_pass(FFVkSPIRVShader *shd, int nb_rows, int first,
                                 gl_SemanticsMakeAvailable |
                                 gl_SemanticsMakeVisible);                     );
     }
-    GLSLC(1, for (y = 0; y < height[0]; y++) {                                );
+    GLSLF(1, for (y = 0; y < height[%i]; y++) {                               ,plane);
     GLSLC(2,     offset = uint64_t(int_stride)*y*T_ALIGN;                     );
     GLSLC(2,     dst = DataBuffer(uint64_t(integral_data) + offset);          );
     GLSLC(0,                                                                  );
@@ -127,7 +127,7 @@  static void insert_vertical_pass(FFVkSPIRVShader *shd, int nb_rows, int first, i
                                 gl_SemanticsMakeAvailable |
                                 gl_SemanticsMakeVisible);                     );
     }
-    GLSLC(1, for (x = 0; x < width[0]; x++) {                                 );
+    GLSLF(1, for (x = 0; x < width[%i]; x++) {                                ,plane);
     GLSLC(2,     dst = DataBuffer(uint64_t(integral_data) + x*T_ALIGN);       );
 
     for (int r = 0; r < nb_rows; r++) {
@@ -156,13 +156,13 @@  static void insert_weights_pass(FFVkSPIRVShader *shd, int nb_rows, int vert,
                             gl_SemanticsMakeVisible);                         );
     GLSLC(1, barrier();                                                       );
     if (!vert) {
-        GLSLC(1, for (y = 0; y < height[0]; y++) {                            );
+        GLSLF(1, for (y = 0; y < height[%i]; y++) {                           ,plane);
         GLSLF(2,     if (gl_GlobalInvocationID.x*%i >= width[%i])             ,nb_rows, plane);
         GLSLC(3,         break;                                               );
         GLSLF(2,     for (r = 0; r < %i; r++) {                       ,nb_rows);
         GLSLF(3,         x = int(gl_GlobalInvocationID.x) * %i + r;   ,nb_rows);
     } else {
-        GLSLC(1, for (x = 0; x < width[0]; x++) {                             );
+        GLSLF(1, for (x = 0; x < width[%i]; x++) {                            ,plane);
         GLSLF(2,     if (gl_GlobalInvocationID.x*%i >= height[%i])            ,nb_rows, plane);
         GLSLC(3,         break;                                               );
         GLSLF(2,     for (r = 0; r < %i; r++) {                       ,nb_rows);
-- 
2.42.0