diff mbox series

[FFmpeg-devel,v2] avfilter/scale_cuda: add support for rgb32/bgr32 conversions

Message ID 20230616165444.990190-1-philipl@overt.org
State New
Headers show
Series [FFmpeg-devel,v2] avfilter/scale_cuda: add support for rgb32/bgr32 conversions | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Philip Langdale June 16, 2023, 4:54 p.m. UTC
As we are introducing two new formats and supporting conversions
between them, and also with the existing 0rgb32/0bgr32 formats, we get
a combinatorial explosion of kernels. I introduced a few new macros to
keep the things mostly managable.

The conversions are all simple, following existing patterns, with four
specific exceptions. When converting from 0rgb32/0bgr32 to rgb32/bgr32,
we need to ensure the alpha value is set to 1. In all other cases, it
can just be passed through, either to be used or ignored.
---
 libavfilter/vf_scale_cuda.c  |   2 +
 libavfilter/vf_scale_cuda.cu | 151 ++++++++++++++++++++++++++++-------
 2 files changed, 122 insertions(+), 31 deletions(-)
diff mbox series

Patch

diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c
index 1c99befec8..370cb1d9cd 100644
--- a/libavfilter/vf_scale_cuda.c
+++ b/libavfilter/vf_scale_cuda.c
@@ -51,6 +51,8 @@  static const enum AVPixelFormat supported_formats[] = {
     AV_PIX_FMT_YUV444P16,
     AV_PIX_FMT_0RGB32,
     AV_PIX_FMT_0BGR32,
+    AV_PIX_FMT_RGB32,
+    AV_PIX_FMT_BGR32,
 };
 
 #define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu
index c9c6cafdb6..de06ba9433 100644
--- a/libavfilter/vf_scale_cuda.cu
+++ b/libavfilter/vf_scale_cuda.cu
@@ -853,9 +853,67 @@  struct Convert_yuv444p16le_yuv444p16le
     }
 };
 
-// bgr0->X
-
-struct Convert_bgr0_bgr0
+#define DEF_CONVERT_IDENTITY(fmt1, fmt2)\
+                                        \
+struct Convert_##fmt1##_##fmt2          \
+{                                       \
+    static const int in_bit_depth = 8;  \
+    typedef uchar4 in_T;                \
+    typedef uchar in_T_uv;              \
+    typedef uchar4 out_T;               \
+    typedef uchar out_T_uv;             \
+                                        \
+    DEF_F(Convert, out_T)               \
+    {                                   \
+        DEFAULT_DST(0) = SUB_F(y, 0);   \
+    }                                   \
+                                        \
+    DEF_F(Convert_uv, out_T_uv)         \
+    {                                   \
+    }                                   \
+};                                      \
+
+#define DEF_CONVERT_REORDER(fmt1, fmt2) \
+                                        \
+struct Convert_##fmt1##_##fmt2          \
+{                                       \
+    static const int in_bit_depth = 8;  \
+    typedef uchar4 in_T;                \
+    typedef uchar in_T_uv;              \
+    typedef uchar4 out_T;               \
+    typedef uchar out_T_uv;             \
+                                        \
+    DEF_F(Convert, out_T)               \
+    {                                   \
+        uchar4 res = SUB_F(y, 0);       \
+        DEFAULT_DST(0) = make_uchar4(   \
+            res.z,                      \
+            res.y,                      \
+            res.x,                      \
+            res.w                       \
+        );                              \
+    }                                   \
+                                        \
+    DEF_F(Convert_uv, out_T_uv)         \
+    {                                   \
+    }                                   \
+};                                      \
+
+#define DEF_CONVERT_RGB(fmt1, fmt2)     \
+                                        \
+DEF_CONVERT_IDENTITY(fmt1, fmt1)        \
+DEF_CONVERT_REORDER (fmt1, fmt2)        \
+DEF_CONVERT_REORDER (fmt2, fmt1)        \
+DEF_CONVERT_IDENTITY(fmt2, fmt2)
+
+DEF_CONVERT_RGB(rgb0, bgr0)
+DEF_CONVERT_RGB(rgba, bgra)
+DEF_CONVERT_IDENTITY(rgba, rgb0)
+DEF_CONVERT_IDENTITY(bgra, bgr0)
+DEF_CONVERT_REORDER(rgba, bgr0)
+DEF_CONVERT_REORDER(bgra, rgb0)
+
+struct Convert_bgr0_bgra
 {
     static const int in_bit_depth = 8;
     typedef uchar4 in_T;
@@ -865,7 +923,13 @@  struct Convert_bgr0_bgr0
 
     DEF_F(Convert, out_T)
     {
-        DEFAULT_DST(0) = SUB_F(y, 0);
+        uchar4 res = SUB_F(y, 0);
+        DEFAULT_DST(0) = make_uchar4(
+            res.x,
+            res.y,
+            res.z,
+            1
+        );
     }
 
     DEF_F(Convert_uv, out_T_uv)
@@ -873,7 +937,7 @@  struct Convert_bgr0_bgr0
     }
 };
 
-struct Convert_bgr0_rgb0
+struct Convert_bgr0_rgba
 {
     static const int in_bit_depth = 8;
     typedef uchar4 in_T;
@@ -888,7 +952,7 @@  struct Convert_bgr0_rgb0
             res.z,
             res.y,
             res.x,
-            res.w
+            1
         );
     }
 
@@ -897,9 +961,7 @@  struct Convert_bgr0_rgb0
     }
 };
 
-// rgb0->X
-
-struct Convert_rgb0_bgr0
+struct Convert_rgb0_bgra
 {
     static const int in_bit_depth = 8;
     typedef uchar4 in_T;
@@ -914,7 +976,7 @@  struct Convert_rgb0_bgr0
             res.z,
             res.y,
             res.x,
-            res.w
+            1
         );
     }
 
@@ -923,7 +985,7 @@  struct Convert_rgb0_bgr0
     }
 };
 
-struct Convert_rgb0_rgb0
+struct Convert_rgb0_rgba
 {
     static const int in_bit_depth = 8;
     typedef uchar4 in_T;
@@ -933,7 +995,13 @@  struct Convert_rgb0_rgb0
 
     DEF_F(Convert, out_T)
     {
-        DEFAULT_DST(0) = SUB_F(y, 0);
+        uchar4 res = SUB_F(y, 0);
+        DEFAULT_DST(0) = make_uchar4(
+            res.x,
+            res.y,
+            res.z,
+            1
+        );
     }
 
     DEF_F(Convert_uv, out_T_uv)
@@ -1117,6 +1185,12 @@  extern "C" {
     NEAREST_KERNEL_RAW(p016le_ ## C)      \
     NEAREST_KERNEL_RAW(yuv444p16le_ ## C)
 
+#define NEAREST_KERNELS_RGB(C) \
+    NEAREST_KERNEL_RAW(rgb0_ ## C)  \
+    NEAREST_KERNEL_RAW(bgr0_ ## C)  \
+    NEAREST_KERNEL_RAW(rgba_ ## C)  \
+    NEAREST_KERNEL_RAW(bgra_ ## C)  \
+
 NEAREST_KERNELS(yuv420p)
 NEAREST_KERNELS(nv12)
 NEAREST_KERNELS(yuv444p)
@@ -1124,11 +1198,10 @@  NEAREST_KERNELS(p010le)
 NEAREST_KERNELS(p016le)
 NEAREST_KERNELS(yuv444p16le)
 
-NEAREST_KERNEL_RAW(bgr0_bgr0)
-NEAREST_KERNEL_RAW(rgb0_rgb0)
-NEAREST_KERNEL_RAW(bgr0_rgb0)
-NEAREST_KERNEL_RAW(rgb0_bgr0)
-
+NEAREST_KERNELS_RGB(rgb0)
+NEAREST_KERNELS_RGB(bgr0)
+NEAREST_KERNELS_RGB(rgba)
+NEAREST_KERNELS_RGB(bgra)
 
 #define BILINEAR_KERNEL(C, S) \
     __global__ void Subsample_Bilinear_##C##S(                      \
@@ -1152,6 +1225,12 @@  NEAREST_KERNEL_RAW(rgb0_bgr0)
     BILINEAR_KERNEL_RAW(p016le_ ## C)      \
     BILINEAR_KERNEL_RAW(yuv444p16le_ ## C)
 
+#define BILINEAR_KERNELS_RGB(C)     \
+    BILINEAR_KERNEL_RAW(rgb0_ ## C) \
+    BILINEAR_KERNEL_RAW(bgr0_ ## C) \
+    BILINEAR_KERNEL_RAW(rgba_ ## C) \
+    BILINEAR_KERNEL_RAW(bgra_ ## C)
+
 BILINEAR_KERNELS(yuv420p)
 BILINEAR_KERNELS(nv12)
 BILINEAR_KERNELS(yuv444p)
@@ -1159,10 +1238,10 @@  BILINEAR_KERNELS(p010le)
 BILINEAR_KERNELS(p016le)
 BILINEAR_KERNELS(yuv444p16le)
 
-BILINEAR_KERNEL_RAW(bgr0_bgr0)
-BILINEAR_KERNEL_RAW(rgb0_rgb0)
-BILINEAR_KERNEL_RAW(bgr0_rgb0)
-BILINEAR_KERNEL_RAW(rgb0_bgr0)
+BILINEAR_KERNELS_RGB(rgb0)
+BILINEAR_KERNELS_RGB(bgr0)
+BILINEAR_KERNELS_RGB(rgba)
+BILINEAR_KERNELS_RGB(bgra)
 
 #define BICUBIC_KERNEL(C, S) \
     __global__ void Subsample_Bicubic_##C##S(                                        \
@@ -1186,6 +1265,12 @@  BILINEAR_KERNEL_RAW(rgb0_bgr0)
     BICUBIC_KERNEL_RAW(p016le_ ## C)      \
     BICUBIC_KERNEL_RAW(yuv444p16le_ ## C)
 
+#define BICUBIC_KERNELS_RGB(C)      \
+    BICUBIC_KERNEL_RAW(rgb0_ ## C)  \
+    BICUBIC_KERNEL_RAW(bgr0_ ## C)  \
+    BICUBIC_KERNEL_RAW(rgba_ ## C)  \
+    BICUBIC_KERNEL_RAW(bgra_ ## C)
+
 BICUBIC_KERNELS(yuv420p)
 BICUBIC_KERNELS(nv12)
 BICUBIC_KERNELS(yuv444p)
@@ -1193,11 +1278,10 @@  BICUBIC_KERNELS(p010le)
 BICUBIC_KERNELS(p016le)
 BICUBIC_KERNELS(yuv444p16le)
 
-BICUBIC_KERNEL_RAW(bgr0_bgr0)
-BICUBIC_KERNEL_RAW(rgb0_rgb0)
-BICUBIC_KERNEL_RAW(bgr0_rgb0)
-BICUBIC_KERNEL_RAW(rgb0_bgr0)
-
+BICUBIC_KERNELS_RGB(rgb0)
+BICUBIC_KERNELS_RGB(bgr0)
+BICUBIC_KERNELS_RGB(rgba)
+BICUBIC_KERNELS_RGB(bgra)
 
 #define LANCZOS_KERNEL(C, S) \
     __global__ void Subsample_Lanczos_##C##S(                                        \
@@ -1221,6 +1305,12 @@  BICUBIC_KERNEL_RAW(rgb0_bgr0)
     LANCZOS_KERNEL_RAW(p016le_ ## C)      \
     LANCZOS_KERNEL_RAW(yuv444p16le_ ## C)
 
+#define LANCZOS_KERNELS_RGB(C)      \
+    LANCZOS_KERNEL_RAW(rgb0_ ## C)  \
+    LANCZOS_KERNEL_RAW(bgr0_ ## C)  \
+    LANCZOS_KERNEL_RAW(rgba_ ## C)  \
+    LANCZOS_KERNEL_RAW(bgra_ ## C)
+
 LANCZOS_KERNELS(yuv420p)
 LANCZOS_KERNELS(nv12)
 LANCZOS_KERNELS(yuv444p)
@@ -1228,9 +1318,8 @@  LANCZOS_KERNELS(p010le)
 LANCZOS_KERNELS(p016le)
 LANCZOS_KERNELS(yuv444p16le)
 
-LANCZOS_KERNEL_RAW(bgr0_bgr0)
-LANCZOS_KERNEL_RAW(rgb0_rgb0)
-LANCZOS_KERNEL_RAW(bgr0_rgb0)
-LANCZOS_KERNEL_RAW(rgb0_bgr0)
-
+LANCZOS_KERNELS_RGB(rgb0)
+LANCZOS_KERNELS_RGB(bgr0)
+LANCZOS_KERNELS_RGB(rgba)
+LANCZOS_KERNELS_RGB(bgra)
 }