diff mbox

[FFmpeg-devel,1/2] build: add support for building CUDA files with clang

Message ID 20190730075143.40520-1-rodger.combs@gmail.com
State New
Headers show

Commit Message

Rodger Combs July 30, 2019, 7:51 a.m. UTC
This avoids using the CUDA SDK at all; instead, we provide a minimal
reimplementation of the basic functionality that lavfi actually uses.
It generates very similar code to what NVCC produces.

The header contains no implementation code derived from the SDK.
The function and type declarations are derived from the SDK only to the
extent required to build a compatible implementation. This is generally
accepted to qualify as fair use.

Because this option does not require the proprietary SDK, it does not require
the "--enable-nonfree" flag in configure.
---
 compat/cuda/cuda_runtime.h | 131 +++++++++++++++++++++++++++++++++++++
 configure                  |  49 +++++++++-----
 ffbuild/common.mak         |   3 +-
 3 files changed, 166 insertions(+), 17 deletions(-)
 create mode 100644 compat/cuda/cuda_runtime.h

Comments

Philip Langdale July 30, 2019, 8:18 a.m. UTC | #1
On 2019-07-30 15:51, Rodger Combs wrote:
> This avoids using the CUDA SDK at all; instead, we provide a minimal
> reimplementation of the basic functionality that lavfi actually uses.
> It generates very similar code to what NVCC produces.
> 
> The header contains no implementation code derived from the SDK.
> The function and type declarations are derived from the SDK only to the
> extent required to build a compatible implementation. This is generally
> accepted to qualify as fair use.
> 
> Because this option does not require the proprietary SDK, it does not require
> the "--enable-nonfree" flag in configure.
> ---

This is awesome. Thanks so much for doing it. I'm not in a position to test,
and won't be for a couple of weeks, but I'm happy with you just pushing it -
it's strictly no worse than what we have today.

--phil
Jean-Baptiste Kempf July 30, 2019, 1:41 p.m. UTC | #2
On Tue, Jul 30, 2019, at 09:57, Rodger Combs wrote:
> This avoids using the CUDA SDK at all; instead, we provide a minimal
> reimplementation of the basic functionality that lavfi actually uses.
> It generates very similar code to what NVCC produces.

Very very cool.
Timo Rothenpieler July 30, 2019, 1:48 p.m. UTC | #3
On 30.07.2019 09:51, Rodger Combs wrote:
> This avoids using the CUDA SDK at all; instead, we provide a minimal
> reimplementation of the basic functionality that lavfi actually uses.
> It generates very similar code to what NVCC produces.
> 
> The header contains no implementation code derived from the SDK.
> The function and type declarations are derived from the SDK only to the
> extent required to build a compatible implementation. This is generally
> accepted to qualify as fair use.
> 
> Because this option does not require the proprietary SDK, it does not require
> the "--enable-nonfree" flag in configure.
> ---
>   compat/cuda/cuda_runtime.h | 131 +++++++++++++++++++++++++++++++++++++
>   configure                  |  49 +++++++++-----
>   ffbuild/common.mak         |   3 +-
>   3 files changed, 166 insertions(+), 17 deletions(-)
>   create mode 100644 compat/cuda/cuda_runtime.h
> 
> diff --git a/compat/cuda/cuda_runtime.h b/compat/cuda/cuda_runtime.h
> new file mode 100644
> index 0000000000..dbe50f8711
> --- /dev/null
> +++ b/compat/cuda/cuda_runtime.h
> @@ -0,0 +1,131 @@
> +/*
> + * Minimum CUDA compatibility definitions header
> + *
> + * Copyright (c) 2019 Rodger Combs
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AV_COMPAT_CUDA_CUDA_RUNTIME_H
> +#define AV_COMPAT_CUDA_CUDA_RUNTIME_H
> +
> +// Common macros
> +#define __global__ __attribute__((global))
> +#define __device__ __attribute__((device))
> +#define __device_builtin__ __attribute__((device_builtin))
> +#define __align__(N) __attribute__((aligned(N)))
> +#define __inline__ __inline__ __attribute__((always_inline))
> +
> +#define max(a, b) ((a) > (b) ? (a) : (b))
> +#define min(a, b) ((a) < (b) ? (a) : (b))
> +#define abs(x) ((x) < 0 ? -(x) : (x))
> +
> +#define atomicAdd(a, b) (__atomic_fetch_add(a, b, __ATOMIC_SEQ_CST))
> +
> +// Basic typedefs
> +typedef __device_builtin__ unsigned long long cudaTextureObject_t;
> +
> +typedef struct __device_builtin__ __align__(2) uchar2
> +{
> +    unsigned char x, y;
> +} uchar2;
> +
> +typedef struct __device_builtin__ __align__(4) ushort2
> +{
> +    unsigned short x, y;
> +} ushort2;
> +
> +typedef struct __device_builtin__ uint3
> +{
> +    unsigned int x, y, z;
> +} uint3;
> +
> +typedef struct uint3 dim3;
> +
> +typedef struct __device_builtin__ __align__(8) int2
> +{
> +    int x, y;
> +} int2;
> +
> +typedef struct __device_builtin__ __align__(4) uchar4
> +{
> +    unsigned char x, y, z, w;
> +} uchar4;
> +
> +typedef struct __device_builtin__ __align__(8) ushort4
> +{
> +    unsigned char x, y, z, w;
> +} ushort4;
> +
> +typedef struct __device_builtin__ __align__(16) int4
> +{
> +    int x, y, z, w;
> +} int4;
> +
> +// Accessors for special registers
> +#define GETCOMP(reg, comp) \
> +    asm("mov.u32 %0, %%" #reg "." #comp ";" : "=r"(tmp)); \
> +    ret.comp = tmp;
> +
> +#define GET(name, reg) static inline __device__ uint3 name() {\
> +    uint3 ret; \
> +    unsigned tmp; \
> +    GETCOMP(reg, x) \
> +    GETCOMP(reg, y) \
> +    GETCOMP(reg, z) \
> +    return ret; \
> +}
> +
> +GET(getBlockIdx, ctaid)
> +GET(getBlockDim, ntid)
> +GET(getThreadIdx, tid)
> +
> +// Instead of externs for these registers, we turn access to them into calls into trivial ASM
> +#define blockIdx (getBlockIdx())
> +#define blockDim (getBlockDim())
> +#define threadIdx (getThreadIdx())
> +
> +// Basic initializers (simple macros rather than inline functions)
> +#define make_uchar2(a, b) ((uchar2){.x = a, .y = b})
> +#define make_ushort2(a, b) ((ushort2){.x = a, .y = b})
> +#define make_uchar4(a, b, c, d) ((uchar4){.x = a, .y = b, .z = c, .w = d})
> +#define make_ushort4(a, b, c, d) ((ushort4){.x = a, .y = b, .z = c, .w = d})
> +
> +// Conversions from the tex instruction's 4-register output to various types
> +#define TEX2D(type, ret) static inline __device__ void conv(type* out, unsigned a, unsigned b, unsigned c, unsigned d) {*out = (ret);}
> +
> +TEX2D(unsigned char, a & 0xFF)
> +TEX2D(unsigned short, a & 0xFFFF)
> +TEX2D(uchar2, make_uchar2(a & 0xFF, b & 0xFF))
> +TEX2D(ushort2, make_ushort2(a & 0xFFFF, b & 0xFFFF))
> +TEX2D(uchar4, make_uchar4(a & 0xFF, b & 0xFF, c & 0xFF, d & 0xFF))
> +TEX2D(ushort4, make_ushort4(a & 0xFFFF, b & 0xFFFF, c & 0xFFFF, d & 0xFFFF))
> +
> +// Template calling tex instruction and converting the output to the selected type
> +template <class T>
> +static inline __device__ T tex2D(cudaTextureObject_t texObject, float x, float y)
> +{
> +  T ret;
> +  unsigned ret1, ret2, ret3, ret4;
> +  asm("tex.2d.v4.u32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" :
> +      "=r"(ret1), "=r"(ret2), "=r"(ret3), "=r"(ret4) :
> +      "l"(texObject), "f"(x), "f"(y));
> +  conv(&ret, ret1, ret2, ret3, ret4);
> +  return ret;
> +}
> +
> +#endif
> diff --git a/configure b/configure
> index b619fd3d4a..bedeb0c4f0 100755
> --- a/configure
> +++ b/configure
> @@ -324,6 +324,7 @@ External library support:
>     --disable-amf            disable AMF video encoding code [autodetect]
>     --disable-audiotoolbox   disable Apple AudioToolbox code [autodetect]
>     --enable-cuda-nvcc       enable Nvidia CUDA compiler [no]
> +  --enable-cuda-llvm       enable CUDA compilation using clang [no]

Maybe we could even autodetect cuda-llvm, and potentially even nvcc, 
assuming we remove it from non-free as well.

>     --disable-cuvid          disable Nvidia CUVID support [autodetect]
>     --disable-d3d11va        disable Microsoft Direct3D 11 video acceleration code [autodetect]
>     --disable-dxva2          disable Microsoft DirectX 9 video acceleration code [autodetect]
> @@ -372,7 +373,7 @@ Toolchain options:
>     --cxx=CXX                use C compiler CXX [$cxx_default]
>     --objcc=OCC              use ObjC compiler OCC [$cc_default]
>     --dep-cc=DEPCC           use dependency generator DEPCC [$cc_default]
> -  --nvcc=NVCC              use Nvidia CUDA compiler NVCC [$nvcc_default]
> +  --nvcc=NVCC              use Nvidia CUDA compiler NVCC or clang [$nvcc_default]
>     --ld=LD                  use linker LD [$ld_default]
>     --pkg-config=PKGCONFIG   use pkg-config tool PKGCONFIG [$pkg_config_default]
>     --pkg-config-flags=FLAGS pass additional flags to pkgconf []
> @@ -1201,7 +1202,7 @@ test_nvcc(){
>       tmpcu_=$TMPCU
>       tmpo_=$TMPO
>       [ -x "$(command -v cygpath)" ] && tmpcu_=$(cygpath -m $tmpcu_) && tmpo_=$(cygpath -m $tmpo_)
> -    test_cmd $nvcc -ptx $NVCCFLAGS "$@" $NVCC_C $(nvcc_o $tmpo_) $tmpcu_
> +    test_cmd $nvcc $nvccflags "$@" $NVCC_C $(nvcc_o $tmpo_) $tmpcu_
>   }
>   
>   check_nvcc() {
> @@ -2007,6 +2008,7 @@ HWACCEL_LIBRARY_NONFREE_LIST="
>   
>   HWACCEL_LIBRARY_LIST="
>       $HWACCEL_LIBRARY_NONFREE_LIST
> +    cuda_llvm

With this being a thing, I don't see a reason for cuda_nvcc being 
non-free anymore. I'd actually prefer for it to be a single 
auto-detected configure flag, keeping the nvcc name for compatibility.
Not sure if nvcc or clang should be preferred. I'm inclined to say 
prefer nvcc, fall back to clang, and keep the enable-cuda-llvm to 
explicitly select clang.


>       libmfx
>       mmal
>       omx
> @@ -3161,8 +3163,10 @@ v4l2_m2m_deps="linux_videodev2_h sem_timedwait"
>   
>   hwupload_cuda_filter_deps="ffnvcodec"
>   scale_npp_filter_deps="ffnvcodec libnpp"
> -scale_cuda_filter_deps="ffnvcodec cuda_nvcc"
> -thumbnail_cuda_filter_deps="ffnvcodec cuda_nvcc"
> +scale_cuda_filter_deps="ffnvcodec"
> +scale_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
> +thumbnail_cuda_filter_deps="ffnvcodec"
> +thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
>   transpose_npp_filter_deps="ffnvcodec libnpp"
>   
>   amf_deps_any="libdl LoadLibrary"
> @@ -3764,7 +3768,8 @@ zscale_filter_deps="libzimg const_nan"
>   scale_vaapi_filter_deps="vaapi"
>   vpp_qsv_filter_deps="libmfx"
>   vpp_qsv_filter_select="qsvvpp"
> -yadif_cuda_filter_deps="ffnvcodec cuda_nvcc"
> +yadif_cuda_filter_deps="ffnvcodec"
> +yadif_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
>   
>   inlineass_filter_deps="avcodec libass"
>   
> @@ -4450,6 +4455,13 @@ windres_default="${cross_prefix}${windres_default}"
>   
>   sysinclude_default="${sysroot}/usr/include"
>   
> +if enabled cuda_llvm; then
> +    enabled cuda_nvcc && die "Cannot enable both cuda-nvcc and cuda-llvm"
> +    nvcc_default="clang"
> +    nvccflags_default="--cuda-gpu-arch=sm_30 -O2"
> +    NVCC_C=""
> +fi
> +
>   set_default arch cc cxx doxygen pkg_config ranlib strip sysinclude \
>       target_exec x86asmexe nvcc
>   enabled cross_compile || host_cc_default=$cc
> @@ -6303,6 +6315,22 @@ if enabled cuda_sdk; then
>       enable cuda_nvcc
>   fi
>   
> +if [ -z "$nvccflags" ]; then
> +    nvccflags=$nvccflags_default
> +fi
> +
> +if enabled x86_64 || enabled ppc64 || enabled aarch64; then
> +    nvccflags="$nvccflags -m64"
> +else
> +    nvccflags="$nvccflags -m32"
> +fi
> +
> +if enabled cuda_llvm; then
> +    nvccflags="$nvccflags -S -nocudalib -nocudainc --cuda-device-only -include${source_link}/compat/cuda/cuda_runtime.h"
> +else
> +    nvccflags="$nvccflags -ptx"
> +fi
> +
>   if ! disabled ffnvcodec; then
>       ffnv_hdr_list="ffnvcodec/nvEncodeAPI.h ffnvcodec/dynlink_cuda.h ffnvcodec/dynlink_cuviddec.h ffnvcodec/dynlink_nvcuvid.h"
>       check_pkg_config ffnvcodec "ffnvcodec >= 9.0.18.0" "$ffnv_hdr_list" "" || \
> @@ -6379,6 +6407,7 @@ done
>   
>   # these are off by default, so fail if requested and not available
>   enabled cuda_nvcc         && { check_nvcc || die "ERROR: failed checking for nvcc."; }
> +enabled cuda_llvm         && { check_nvcc || die "ERROR: failed checking for clang for CUDA."; }
>   enabled chromaprint       && require chromaprint chromaprint.h chromaprint_get_version -lchromaprint
>   enabled decklink          && { require_headers DeckLinkAPI.h &&
>                                  { test_cpp_condition DeckLinkAPIVersion.h "BLACKMAGIC_DECKLINK_API_VERSION >= 0x0a090500" || die "ERROR: Decklink API version must be >= 10.9.5."; } }
> @@ -6931,16 +6960,6 @@ if [ -z "$optflags" ]; then
>       fi
>   fi
>   
> -if [ -z "$nvccflags" ]; then
> -    nvccflags=$nvccflags_default
> -fi
> -
> -if enabled x86_64 || enabled ppc64 || enabled aarch64; then
> -    nvccflags="$nvccflags -m64"
> -else
> -    nvccflags="$nvccflags -m32"
> -fi
> -
>   check_optflags(){
>       check_cflags "$@"
>       enabled lto && check_ldflags "$@"
> diff --git a/ffbuild/common.mak b/ffbuild/common.mak
> index 8770e45a1a..28ecc23e42 100644
> --- a/ffbuild/common.mak
> +++ b/ffbuild/common.mak
> @@ -38,7 +38,6 @@ OBJCCFLAGS  = $(CPPFLAGS) $(CFLAGS) $(OBJCFLAGS)
>   ASFLAGS    := $(CPPFLAGS) $(ASFLAGS)
>   CXXFLAGS   := $(CPPFLAGS) $(CFLAGS) $(CXXFLAGS)
>   X86ASMFLAGS += $(IFLAGS:%=%/) -I$(<D)/ -Pconfig.asm
> -NVCCFLAGS  += -ptx
>   
>   HOSTCCFLAGS = $(IFLAGS) $(HOSTCPPFLAGS) $(HOSTCFLAGS)
>   LDFLAGS    := $(ALLFFLIBS:%=$(LD_PATH)lib%) $(LDFLAGS)
> @@ -91,7 +90,7 @@ COMPILE_NVCC = $(call COMPILE,NVCC)
>   %.h.c:
>   	$(Q)echo '#include "$*.h"' >$@
>   
> -%.ptx: %.cu
> +%.ptx: %.cu $(SRC_PATH)/compat/cuda/cuda_runtime.h
>   	$(COMPILE_NVCC)
>   
>   %.ptx.c: %.ptx
> 

Otherwise this looks very good to me!
Timo Rothenpieler Aug. 4, 2019, 5:23 p.m. UTC | #4
On 30.07.2019 09:51, Rodger Combs wrote:
> This avoids using the CUDA SDK at all; instead, we provide a minimal
> reimplementation of the basic functionality that lavfi actually uses.
> It generates very similar code to what NVCC produces.
> 
> The header contains no implementation code derived from the SDK.
> The function and type declarations are derived from the SDK only to the
> extent required to build a compatible implementation. This is generally
> accepted to qualify as fair use.
> 
> Because this option does not require the proprietary SDK, it does not require
> the "--enable-nonfree" flag in configure.

Changed configure logic to autodetect cuda-llvm and applied.
diff mbox

Patch

diff --git a/compat/cuda/cuda_runtime.h b/compat/cuda/cuda_runtime.h
new file mode 100644
index 0000000000..dbe50f8711
--- /dev/null
+++ b/compat/cuda/cuda_runtime.h
@@ -0,0 +1,131 @@ 
+/*
+ * Minimum CUDA compatibility definitions header
+ *
+ * Copyright (c) 2019 Rodger Combs
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AV_COMPAT_CUDA_CUDA_RUNTIME_H
+#define AV_COMPAT_CUDA_CUDA_RUNTIME_H
+
+// Common macros
+#define __global__ __attribute__((global))
+#define __device__ __attribute__((device))
+#define __device_builtin__ __attribute__((device_builtin))
+#define __align__(N) __attribute__((aligned(N)))
+#define __inline__ __inline__ __attribute__((always_inline))
+
+#define max(a, b) ((a) > (b) ? (a) : (b))
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#define abs(x) ((x) < 0 ? -(x) : (x))
+
+#define atomicAdd(a, b) (__atomic_fetch_add(a, b, __ATOMIC_SEQ_CST))
+
+// Basic typedefs
+typedef __device_builtin__ unsigned long long cudaTextureObject_t;
+
+typedef struct __device_builtin__ __align__(2) uchar2
+{
+    unsigned char x, y;
+} uchar2;
+
+typedef struct __device_builtin__ __align__(4) ushort2
+{
+    unsigned short x, y;
+} ushort2;
+
+typedef struct __device_builtin__ uint3
+{
+    unsigned int x, y, z;
+} uint3;
+
+typedef struct uint3 dim3;
+
+typedef struct __device_builtin__ __align__(8) int2
+{
+    int x, y;
+} int2;
+
+typedef struct __device_builtin__ __align__(4) uchar4
+{
+    unsigned char x, y, z, w;
+} uchar4;
+
+typedef struct __device_builtin__ __align__(8) ushort4
+{
+    unsigned short x, y, z, w; // 16-bit components: 4 x 2 bytes fills the 8-byte alignment and holds the 0xFFFF-masked values from conv()
+} ushort4;
+
+typedef struct __device_builtin__ __align__(16) int4
+{
+    int x, y, z, w;
+} int4;
+
+// Accessors for special registers
+#define GETCOMP(reg, comp) \
+    asm("mov.u32 %0, %%" #reg "." #comp ";" : "=r"(tmp)); \
+    ret.comp = tmp;
+
+#define GET(name, reg) static inline __device__ uint3 name() {\
+    uint3 ret; \
+    unsigned tmp; \
+    GETCOMP(reg, x) \
+    GETCOMP(reg, y) \
+    GETCOMP(reg, z) \
+    return ret; \
+}
+
+GET(getBlockIdx, ctaid)
+GET(getBlockDim, ntid)
+GET(getThreadIdx, tid)
+
+// Instead of externs for these registers, we turn access to them into calls into trivial ASM
+#define blockIdx (getBlockIdx())
+#define blockDim (getBlockDim())
+#define threadIdx (getThreadIdx())
+
+// Basic initializers (simple macros rather than inline functions)
+#define make_uchar2(a, b) ((uchar2){.x = a, .y = b})
+#define make_ushort2(a, b) ((ushort2){.x = a, .y = b})
+#define make_uchar4(a, b, c, d) ((uchar4){.x = a, .y = b, .z = c, .w = d})
+#define make_ushort4(a, b, c, d) ((ushort4){.x = a, .y = b, .z = c, .w = d})
+
+// Conversions from the tex instruction's 4-register output to various types
+#define TEX2D(type, ret) static inline __device__ void conv(type* out, unsigned a, unsigned b, unsigned c, unsigned d) {*out = (ret);}
+
+TEX2D(unsigned char, a & 0xFF)
+TEX2D(unsigned short, a & 0xFFFF)
+TEX2D(uchar2, make_uchar2(a & 0xFF, b & 0xFF))
+TEX2D(ushort2, make_ushort2(a & 0xFFFF, b & 0xFFFF))
+TEX2D(uchar4, make_uchar4(a & 0xFF, b & 0xFF, c & 0xFF, d & 0xFF))
+TEX2D(ushort4, make_ushort4(a & 0xFFFF, b & 0xFFFF, c & 0xFFFF, d & 0xFFFF))
+
+// Template calling tex instruction and converting the output to the selected type
+template <class T>
+static inline __device__ T tex2D(cudaTextureObject_t texObject, float x, float y)
+{
+  T ret;
+  unsigned ret1, ret2, ret3, ret4;
+  asm("tex.2d.v4.u32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" :
+      "=r"(ret1), "=r"(ret2), "=r"(ret3), "=r"(ret4) :
+      "l"(texObject), "f"(x), "f"(y));
+  conv(&ret, ret1, ret2, ret3, ret4);
+  return ret;
+}
+
+#endif
diff --git a/configure b/configure
index b619fd3d4a..bedeb0c4f0 100755
--- a/configure
+++ b/configure
@@ -324,6 +324,7 @@  External library support:
   --disable-amf            disable AMF video encoding code [autodetect]
   --disable-audiotoolbox   disable Apple AudioToolbox code [autodetect]
   --enable-cuda-nvcc       enable Nvidia CUDA compiler [no]
+  --enable-cuda-llvm       enable CUDA compilation using clang [no]
   --disable-cuvid          disable Nvidia CUVID support [autodetect]
   --disable-d3d11va        disable Microsoft Direct3D 11 video acceleration code [autodetect]
   --disable-dxva2          disable Microsoft DirectX 9 video acceleration code [autodetect]
@@ -372,7 +373,7 @@  Toolchain options:
   --cxx=CXX                use C compiler CXX [$cxx_default]
   --objcc=OCC              use ObjC compiler OCC [$cc_default]
   --dep-cc=DEPCC           use dependency generator DEPCC [$cc_default]
-  --nvcc=NVCC              use Nvidia CUDA compiler NVCC [$nvcc_default]
+  --nvcc=NVCC              use Nvidia CUDA compiler NVCC or clang [$nvcc_default]
   --ld=LD                  use linker LD [$ld_default]
   --pkg-config=PKGCONFIG   use pkg-config tool PKGCONFIG [$pkg_config_default]
   --pkg-config-flags=FLAGS pass additional flags to pkgconf []
@@ -1201,7 +1202,7 @@  test_nvcc(){
     tmpcu_=$TMPCU
     tmpo_=$TMPO
     [ -x "$(command -v cygpath)" ] && tmpcu_=$(cygpath -m $tmpcu_) && tmpo_=$(cygpath -m $tmpo_)
-    test_cmd $nvcc -ptx $NVCCFLAGS "$@" $NVCC_C $(nvcc_o $tmpo_) $tmpcu_
+    test_cmd $nvcc $nvccflags "$@" $NVCC_C $(nvcc_o $tmpo_) $tmpcu_
 }
 
 check_nvcc() {
@@ -2007,6 +2008,7 @@  HWACCEL_LIBRARY_NONFREE_LIST="
 
 HWACCEL_LIBRARY_LIST="
     $HWACCEL_LIBRARY_NONFREE_LIST
+    cuda_llvm
     libmfx
     mmal
     omx
@@ -3161,8 +3163,10 @@  v4l2_m2m_deps="linux_videodev2_h sem_timedwait"
 
 hwupload_cuda_filter_deps="ffnvcodec"
 scale_npp_filter_deps="ffnvcodec libnpp"
-scale_cuda_filter_deps="ffnvcodec cuda_nvcc"
-thumbnail_cuda_filter_deps="ffnvcodec cuda_nvcc"
+scale_cuda_filter_deps="ffnvcodec"
+scale_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+thumbnail_cuda_filter_deps="ffnvcodec"
+thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
 
 amf_deps_any="libdl LoadLibrary"
@@ -3764,7 +3768,8 @@  zscale_filter_deps="libzimg const_nan"
 scale_vaapi_filter_deps="vaapi"
 vpp_qsv_filter_deps="libmfx"
 vpp_qsv_filter_select="qsvvpp"
-yadif_cuda_filter_deps="ffnvcodec cuda_nvcc"
+yadif_cuda_filter_deps="ffnvcodec"
+yadif_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 
 inlineass_filter_deps="avcodec libass"
 
@@ -4450,6 +4455,13 @@  windres_default="${cross_prefix}${windres_default}"
 
 sysinclude_default="${sysroot}/usr/include"
 
+if enabled cuda_llvm; then
+    enabled cuda_nvcc && die "Cannot enable both cuda-nvcc and cuda-llvm"
+    nvcc_default="clang"
+    nvccflags_default="--cuda-gpu-arch=sm_30 -O2"
+    NVCC_C=""
+fi
+
 set_default arch cc cxx doxygen pkg_config ranlib strip sysinclude \
     target_exec x86asmexe nvcc
 enabled cross_compile || host_cc_default=$cc
@@ -6303,6 +6315,22 @@  if enabled cuda_sdk; then
     enable cuda_nvcc
 fi
 
+if [ -z "$nvccflags" ]; then
+    nvccflags=$nvccflags_default
+fi
+
+if enabled x86_64 || enabled ppc64 || enabled aarch64; then
+    nvccflags="$nvccflags -m64"
+else
+    nvccflags="$nvccflags -m32"
+fi
+
+if enabled cuda_llvm; then
+    nvccflags="$nvccflags -S -nocudalib -nocudainc --cuda-device-only -include${source_link}/compat/cuda/cuda_runtime.h"
+else
+    nvccflags="$nvccflags -ptx"
+fi
+
 if ! disabled ffnvcodec; then
     ffnv_hdr_list="ffnvcodec/nvEncodeAPI.h ffnvcodec/dynlink_cuda.h ffnvcodec/dynlink_cuviddec.h ffnvcodec/dynlink_nvcuvid.h"
     check_pkg_config ffnvcodec "ffnvcodec >= 9.0.18.0" "$ffnv_hdr_list" "" || \
@@ -6379,6 +6407,7 @@  done
 
 # these are off by default, so fail if requested and not available
 enabled cuda_nvcc         && { check_nvcc || die "ERROR: failed checking for nvcc."; }
+enabled cuda_llvm         && { check_nvcc || die "ERROR: failed checking for clang for CUDA."; }
 enabled chromaprint       && require chromaprint chromaprint.h chromaprint_get_version -lchromaprint
 enabled decklink          && { require_headers DeckLinkAPI.h &&
                                { test_cpp_condition DeckLinkAPIVersion.h "BLACKMAGIC_DECKLINK_API_VERSION >= 0x0a090500" || die "ERROR: Decklink API version must be >= 10.9.5."; } }
@@ -6931,16 +6960,6 @@  if [ -z "$optflags" ]; then
     fi
 fi
 
-if [ -z "$nvccflags" ]; then
-    nvccflags=$nvccflags_default
-fi
-
-if enabled x86_64 || enabled ppc64 || enabled aarch64; then
-    nvccflags="$nvccflags -m64"
-else
-    nvccflags="$nvccflags -m32"
-fi
-
 check_optflags(){
     check_cflags "$@"
     enabled lto && check_ldflags "$@"
diff --git a/ffbuild/common.mak b/ffbuild/common.mak
index 8770e45a1a..28ecc23e42 100644
--- a/ffbuild/common.mak
+++ b/ffbuild/common.mak
@@ -38,7 +38,6 @@  OBJCCFLAGS  = $(CPPFLAGS) $(CFLAGS) $(OBJCFLAGS)
 ASFLAGS    := $(CPPFLAGS) $(ASFLAGS)
 CXXFLAGS   := $(CPPFLAGS) $(CFLAGS) $(CXXFLAGS)
 X86ASMFLAGS += $(IFLAGS:%=%/) -I$(<D)/ -Pconfig.asm
-NVCCFLAGS  += -ptx
 
 HOSTCCFLAGS = $(IFLAGS) $(HOSTCPPFLAGS) $(HOSTCFLAGS)
 LDFLAGS    := $(ALLFFLIBS:%=$(LD_PATH)lib%) $(LDFLAGS)
@@ -91,7 +90,7 @@  COMPILE_NVCC = $(call COMPILE,NVCC)
 %.h.c:
 	$(Q)echo '#include "$*.h"' >$@
 
-%.ptx: %.cu
+%.ptx: %.cu $(SRC_PATH)/compat/cuda/cuda_runtime.h
 	$(COMPILE_NVCC)
 
 %.ptx.c: %.ptx