diff mbox series

[FFmpeg-devel,v2] avfilter: compress CUDA PTX code if possible

Message ID 20210612164750.14944-1-timo@rothenpieler.org
State Accepted
Headers show
Series [FFmpeg-devel,v2] avfilter: compress CUDA PTX code if possible
Related show

Checks

Context Check Description
andriy/configure warning Failed to apply patch
andriy/configure warning Failed to apply patch

Commit Message

Timo Rothenpieler June 12, 2021, 4:47 p.m. UTC
---
 .gitignore                      |  1 +
 compat/cuda/ptx2c.sh            | 34 ------------
 configure                       | 17 ++++++
 ffbuild/.gitignore              |  1 +
 ffbuild/bin2c.c                 | 76 ++++++++++++++++++++++++++
 ffbuild/common.mak              | 28 ++++++++--
 libavfilter/Makefile            | 14 +++--
 libavfilter/cuda/load_helper.c  | 96 +++++++++++++++++++++++++++++++++
 libavfilter/cuda/load_helper.h  | 28 ++++++++++
 libavfilter/vf_format_cuda.c    |  7 ++-
 libavfilter/vf_overlay_cuda.c   |  8 +--
 libavfilter/vf_scale_cuda.c     | 24 ++++++---
 libavfilter/vf_thumbnail_cuda.c |  7 ++-
 libavfilter/vf_yadif_cuda.c     |  7 ++-
 14 files changed, 287 insertions(+), 61 deletions(-)
 delete mode 100755 compat/cuda/ptx2c.sh
 create mode 100644 ffbuild/bin2c.c
 create mode 100644 libavfilter/cuda/load_helper.c
 create mode 100644 libavfilter/cuda/load_helper.h

Comments

Philip Langdale June 18, 2021, 8:45 p.m. UTC | #1
On Sat, 12 Jun 2021 18:47:50 +0200
Timo Rothenpieler <timo@rothenpieler.org> wrote:

> ---
>  .gitignore                      |  1 +
>  compat/cuda/ptx2c.sh            | 34 ------------
>  configure                       | 17 ++++++
>  ffbuild/.gitignore              |  1 +
>  ffbuild/bin2c.c                 | 76 ++++++++++++++++++++++++++
>  ffbuild/common.mak              | 28 ++++++++--
>  libavfilter/Makefile            | 14 +++--
>  libavfilter/cuda/load_helper.c  | 96 +++++++++++++++++++++++++++++++++
>  libavfilter/cuda/load_helper.h  | 28 ++++++++++
>  libavfilter/vf_format_cuda.c    |  7 ++-
>  libavfilter/vf_overlay_cuda.c   |  8 +--
>  libavfilter/vf_scale_cuda.c     | 24 ++++++---
>  libavfilter/vf_thumbnail_cuda.c |  7 ++-
>  libavfilter/vf_yadif_cuda.c     |  7 ++-
>  14 files changed, 287 insertions(+), 61 deletions(-)
>  delete mode 100755 compat/cuda/ptx2c.sh
>  create mode 100644 ffbuild/bin2c.c
>  create mode 100644 libavfilter/cuda/load_helper.c
>  create mode 100644 libavfilter/cuda/load_helper.h

I just had comments about one file: 
 
> diff --git a/libavfilter/cuda/load_helper.c
> b/libavfilter/cuda/load_helper.c new file mode 100644
> index 0000000000..62d644c29a
> --- /dev/null
> +++ b/libavfilter/cuda/load_helper.c
> @@ -0,0 +1,96 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#include "config.h"
> +
> +#include "libavutil/hwcontext.h"
> +#include "libavutil/hwcontext_cuda_internal.h"
> +#include "libavutil/cuda_check.h"
> +
> +#if CONFIG_PTX_COMPRESSION
> +#include <zlib.h>
> +#define CHUNK_SIZE 1024 * 64
> +#endif
> +
> +#include "load_helper.h"
> +
> +#define CHECK_CU(x) FF_CUDA_CHECK_DL(avctx, cu, x)
> +
> +int ff_cuda_load_module(void *avctx, AVCUDADeviceContext *hwctx, CUmodule *cu_module,
> +                        const unsigned char *data, const unsigned int length)
> +{
> +    CudaFunctions *cu = hwctx->internal->cuda_dl;
> +
> +#if CONFIG_PTX_COMPRESSION
> +    z_stream stream = { 0 };
> +    uint8_t *buf, *tmp;
> +    uint64_t buf_size;
> +    int ret;
> +
> +    if (inflateInit2(&stream, 32 + 15) != Z_OK) {

Can you add a comment explaining the magic numbers?

> +        av_log(avctx, AV_LOG_ERROR, "Error during zlib
> initialisation: %s\n", stream.msg);
> +        return AVERROR(ENOSYS);
> +    }
> +
> +    buf_size = CHUNK_SIZE * 4;
> +    buf = av_realloc(NULL, buf_size);
> +    if (!buf) {
> +        inflateEnd(&stream);
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    stream.next_in = data;
> +    stream.avail_in = length;
> +
> +    do {
> +        stream.avail_out = buf_size - stream.total_out;
> +        stream.next_out = buf + stream.total_out;
> +
> +        ret = inflate(&stream, Z_FINISH);
> +        if (ret != Z_OK && ret != Z_STREAM_END) {
> +            av_log(avctx, AV_LOG_ERROR, "zlib inflate error: %s\n",
> stream.msg);
> +            inflateEnd(&stream);
> +            av_free(buf);
> +            return AVERROR(EINVAL);
> +        }
> +
> +        if (stream.avail_out == 0) {
> +            buf_size += CHUNK_SIZE;
> +            tmp = av_realloc(buf, buf_size);
> +            if (!tmp) {
> +                inflateEnd(&stream);
> +                av_free(buf);
> +                return AVERROR(ENOMEM);
> +            }
> +            buf = tmp;
> +        }
> +    } while (ret != Z_STREAM_END);
> +
> +    // NULL-terminate string
> +    // there is guaranteed to be space for this, due to condition in
> loop

This is because it will still grow the buffer if avail_out is zero at
the time you hit Z_STREAM_END?

> +    buf[stream.total_out] = 0;
> +
> +    inflateEnd(&stream);
> +
> +    ret = CHECK_CU(cu->cuModuleLoadData(cu_module, buf));
> +    av_free(buf);
> +    return ret;
> +#else
> +    return CHECK_CU(cu->cuModuleLoadData(cu_module, data));
> +#endif
> +}

Otherwise, LGTM.

Thanks,

--phil
Timo Rothenpieler June 18, 2021, 9:07 p.m. UTC | #2
On 18.06.2021 22:45, Philip Langdale wrote:
> On Sat, 12 Jun 2021 18:47:50 +0200
> Timo Rothenpieler <timo@rothenpieler.org> wrote:
> 
>> ---
>>   .gitignore                      |  1 +
>>   compat/cuda/ptx2c.sh            | 34 ------------
>>   configure                       | 17 ++++++
>>   ffbuild/.gitignore              |  1 +
>>   ffbuild/bin2c.c                 | 76 ++++++++++++++++++++++++++
>>   ffbuild/common.mak              | 28 ++++++++--
>>   libavfilter/Makefile            | 14 +++--
>>   libavfilter/cuda/load_helper.c  | 96 +++++++++++++++++++++++++++++++++
>>   libavfilter/cuda/load_helper.h  | 28 ++++++++++
>>   libavfilter/vf_format_cuda.c    |  7 ++-
>>   libavfilter/vf_overlay_cuda.c   |  8 +--
>>   libavfilter/vf_scale_cuda.c     | 24 ++++++---
>>   libavfilter/vf_thumbnail_cuda.c |  7 ++-
>>   libavfilter/vf_yadif_cuda.c     |  7 ++-
>>   14 files changed, 287 insertions(+), 61 deletions(-)
>>   delete mode 100755 compat/cuda/ptx2c.sh
>>   create mode 100644 ffbuild/bin2c.c
>>   create mode 100644 libavfilter/cuda/load_helper.c
>>   create mode 100644 libavfilter/cuda/load_helper.h
> 
> I just had comments about one file:
>   
>> diff --git a/libavfilter/cuda/load_helper.c
>> b/libavfilter/cuda/load_helper.c new file mode 100644
>> index 0000000000..62d644c29a
>> --- /dev/null
>> +++ b/libavfilter/cuda/load_helper.c
>> @@ -0,0 +1,96 @@
>> +/*
>> + * This file is part of FFmpeg.
>> + *
>> + * FFmpeg is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2.1 of the License, or (at your option) any later version.
>> + *
>> + * FFmpeg is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with FFmpeg; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>> 02110-1301 USA
>> + */
>> +
>> +#include "config.h"
>> +
>> +#include "libavutil/hwcontext.h"
>> +#include "libavutil/hwcontext_cuda_internal.h"
>> +#include "libavutil/cuda_check.h"
>> +
>> +#if CONFIG_PTX_COMPRESSION
>> +#include <zlib.h>
>> +#define CHUNK_SIZE 1024 * 64
>> +#endif
>> +
>> +#include "load_helper.h"
>> +
>> +#define CHECK_CU(x) FF_CUDA_CHECK_DL(avctx, cu, x)
>> +
>> +int ff_cuda_load_module(void *avctx, AVCUDADeviceContext *hwctx, CUmodule *cu_module,
>> +                        const unsigned char *data, const unsigned int length)
>> +{
>> +    CudaFunctions *cu = hwctx->internal->cuda_dl;
>> +
>> +#if CONFIG_PTX_COMPRESSION
>> +    z_stream stream = { 0 };
>> +    uint8_t *buf, *tmp;
>> +    uint64_t buf_size;
>> +    int ret;
>> +
>> +    if (inflateInit2(&stream, 32 + 15) != Z_OK) {
> 
> Can you add a comment explaining the magic numbers?

I have no idea what those numbers do, I copied them from http.c and they 
work.

>> +        av_log(avctx, AV_LOG_ERROR, "Error during zlib
>> initialisation: %s\n", stream.msg);
>> +        return AVERROR(ENOSYS);
>> +    }
>> +
>> +    buf_size = CHUNK_SIZE * 4;
>> +    buf = av_realloc(NULL, buf_size);
>> +    if (!buf) {
>> +        inflateEnd(&stream);
>> +        return AVERROR(ENOMEM);
>> +    }
>> +
>> +    stream.next_in = data;
>> +    stream.avail_in = length;
>> +
>> +    do {
>> +        stream.avail_out = buf_size - stream.total_out;
>> +        stream.next_out = buf + stream.total_out;
>> +
>> +        ret = inflate(&stream, Z_FINISH);
>> +        if (ret != Z_OK && ret != Z_STREAM_END) {
>> +            av_log(avctx, AV_LOG_ERROR, "zlib inflate error: %s\n",
>> stream.msg);
>> +            inflateEnd(&stream);
>> +            av_free(buf);
>> +            return AVERROR(EINVAL);
>> +        }
>> +
>> +        if (stream.avail_out == 0) {
>> +            buf_size += CHUNK_SIZE;
>> +            tmp = av_realloc(buf, buf_size);
>> +            if (!tmp) {
>> +                inflateEnd(&stream);
>> +                av_free(buf);
>> +                return AVERROR(ENOMEM);
>> +            }
>> +            buf = tmp;
>> +        }
>> +    } while (ret != Z_STREAM_END);
>> +
>> +    // NULL-terminate string
>> +    // there is guaranteed to be space for this, due to condition in
>> loop
> 
> This is because it will still grow the buffer if avail_out is zero at
> the time you hit Z_STREAM_END?

If avail_out was 0, the condition right above it would have grown it, so 
it has at least one byte free.
Philip Langdale June 19, 2021, 6:55 p.m. UTC | #3
On Fri, 18 Jun 2021 23:07:58 +0200
Timo Rothenpieler <timo@rothenpieler.org> wrote:

> >> +
> >> +    if (inflateInit2(&stream, 32 + 15) != Z_OK) {  
> > 
> > Can you add a comment explaining the magic numbers?  
> 
> I have no idea what those numbers do, I copied them from http.c and
> they work.

Heh.

So, I read this: https://www.zlib.net/manual.html

> windowBits can also be greater than 15 for optional gzip decoding.
> Add 32 to windowBits to enable zlib and gzip decoding with automatic
> header detection, or add 16 to decode only the gzip format (the zlib
> format will return a Z_DATA_ERROR). If a gzip stream is being
> decoded, strm->adler is a CRC-32 instead of an Adler-32. Unlike the
> gunzip utility and gzread() (see below), inflate() will not
> automatically decode concatenated gzip streams. inflate() will return
> Z_STREAM_END at the end of the gzip stream. The state would need to
> be reset to continue decoding a subsequent gzip stream.
 
I think that means it is necessary to pass '32 + 15' although you might
do '16 + 15' because you know that the files were really gzipped.

--phil
diff mbox series

Patch

diff --git a/.gitignore b/.gitignore
index 2450ee8fc5..9ed24b542e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,7 @@ 
 *.version
 *.ptx
 *.ptx.c
+*.ptx.gz
 *_g
 \#*
 .\#*
diff --git a/compat/cuda/ptx2c.sh b/compat/cuda/ptx2c.sh
deleted file mode 100755
index 48452379c2..0000000000
--- a/compat/cuda/ptx2c.sh
+++ /dev/null
@@ -1,34 +0,0 @@ 
-#!/bin/sh
-
-# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-set -e
-
-OUT="$1"
-IN="$2"
-NAME="$(basename "$IN" | sed 's/\..*//')"
-
-printf "const char %s_ptx[] = \\" "$NAME" > "$OUT"
-echo >> "$OUT"
-sed -e "$(printf 's/\r//g')" -e 's/["\\]/\\&/g' -e "$(printf 's/^/\t"/')" -e 's/$/\\n"/' < "$IN" >> "$OUT"
-echo ";" >> "$OUT"
-
-exit 0
diff --git a/configure b/configure
index 58e3098a22..41449eac5f 100755
--- a/configure
+++ b/configure
@@ -489,6 +489,7 @@  Developer options (useful when working on FFmpeg itself):
                            in the name) of tests whose result is ignored
   --enable-linux-perf      enable Linux Performance Monitor API
   --disable-large-tests    disable tests that use a large amount of memory
+  --disable-ptx-compression don't compress CUDA PTX code even when possible
 
 NOTE: Object files are built at the place where configure is launched.
 EOF
@@ -1980,6 +1981,7 @@  CONFIG_LIST="
     neon_clobber_test
     ossfuzz
     pic
+    ptx_compression
     thumb
     valgrind_backtrace
     xmm_clobber_test
@@ -2355,6 +2357,7 @@  HAVE_LIST="
     $THREADS_LIST
     $TOOLCHAIN_FEATURES
     $TYPES_LIST
+    gzip
     libdrm_getfb2
     makeinfo
     makeinfo_html
@@ -2367,6 +2370,7 @@  HAVE_LIST="
     perl
     pod2man
     texi2html
+    zlib_gzip
 "
 
 # options emitted with CONFIG_ prefix but not available on the command line
@@ -3838,6 +3842,7 @@  enable doc
 enable faan faandct faanidct
 enable large_tests
 enable optimizations
+enable ptx_compression
 enable runtime_cpudetect
 enable safe_bitstream_reader
 enable static
@@ -6348,6 +6353,18 @@  enabled  zlib && { check_pkg_config zlib zlib "zlib.h" zlibVersion ||
 enabled bzlib && check_lib bzlib bzlib.h BZ2_bzlibVersion    -lbz2
 enabled  lzma && check_lib lzma   lzma.h lzma_version_number -llzma
 
+enabled zlib && test_exec $zlib_extralibs <<EOF && enable zlib_gzip
+#include <zlib.h>
+int main(void) {
+    if (zlibCompileFlags() & (1 << 17)) return 1;
+    return 0;
+}
+EOF
+
+[ -x "$(command -v gzip)" ] && enable gzip
+
+enabled zlib_gzip && enabled gzip || disable ptx_compression
+
 # On some systems dynamic loading requires no extra linker flags
 check_lib libdl dlfcn.h "dlopen dlsym" || check_lib libdl dlfcn.h "dlopen dlsym" -ldl
 
diff --git a/ffbuild/.gitignore b/ffbuild/.gitignore
index 38ed170752..2a70bace43 100644
--- a/ffbuild/.gitignore
+++ b/ffbuild/.gitignore
@@ -1,4 +1,5 @@ 
 /.config
+/bin2c
 /config.fate
 /config.log
 /config.mak
diff --git a/ffbuild/bin2c.c b/ffbuild/bin2c.c
new file mode 100644
index 0000000000..dfeedd7669
--- /dev/null
+++ b/ffbuild/bin2c.c
@@ -0,0 +1,97 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <string.h>
+#include <stdio.h>
+
+/**
+ * Convert a binary file into a C source file defining a byte array.
+ *
+ * Usage: bin2c <input> <output> [name]
+ *
+ * The output defines "const unsigned char ff_<name>_data[]", holding the
+ * input bytes followed by one terminating zero byte, and
+ * "const unsigned int ff_<name>_len", holding the input size without that
+ * terminator. If name is omitted, it is derived from the input path by
+ * replacing every '.' with '_' and dropping everything up to the last '/'.
+ *
+ * Returns 0 on success, 1 on wrong usage and -1 on I/O errors.
+ */
+int main(int argc, char **argv)
+{
+    const char *name;
+    FILE *input, *output;
+    unsigned int length = 0;
+    unsigned char data;
+    int ret = 0;
+
+    if (argc < 3 || argc > 4)
+        return 1;
+
+    input = fopen(argv[1], "rb");
+    if (!input)
+        return -1;
+
+    output = fopen(argv[2], "wb");
+    if (!output) {
+        fclose(input);
+        return -1;
+    }
+
+    if (argc == 4) {
+        name = argv[3];
+    } else {
+        size_t arglen = strlen(argv[1]);
+        name = argv[1];
+
+        for (size_t i = 0; i < arglen; i++) {
+            if (argv[1][i] == '.')
+                argv[1][i] = '_';
+            else if (argv[1][i] == '/')
+                name = &argv[1][i+1];
+        }
+    }
+
+    fprintf(output, "const unsigned char ff_%s_data[] = { ", name);
+
+    while (fread(&data, 1, 1, input) > 0) {
+        fprintf(output, "0x%02x, ", data);
+        length++;
+    }
+
+    fprintf(output, "0x00 };\n");
+    fprintf(output, "const unsigned int ff_%s_len = %u;\n", name, length);
+
+    // A failed write must fail the build instead of silently leaving a
+    // truncated source file behind, so check the stream and the final flush
+    if (ferror(output))
+        ret = -1;
+    if (fclose(output))
+        ret = -1;
+
+    if (ferror(input) || !feof(input))
+        ret = -1;
+
+    fclose(input);
+
+    return ret;
+}
diff --git a/ffbuild/common.mak b/ffbuild/common.mak
index 32f5b997b5..a370c0c30c 100644
--- a/ffbuild/common.mak
+++ b/ffbuild/common.mak
@@ -12,10 +12,13 @@  endif
 
 ifndef SUBDIR
 
+BIN2CEXE = ffbuild/bin2c$(HOSTEXESUF)
+BIN2C = $(BIN2CEXE)
+
 ifndef V
 Q      = @
 ECHO   = printf "$(1)\t%s\n" $(2)
-BRIEF  = CC CXX OBJCC HOSTCC HOSTLD AS X86ASM AR LD STRIP CP WINDRES NVCC
+BRIEF  = CC CXX OBJCC HOSTCC HOSTLD AS X86ASM AR LD STRIP CP WINDRES NVCC BIN2C
 SILENT = DEPCC DEPHOSTCC DEPAS DEPX86ASM RANLIB RM
 
 MSG    = $@
@@ -98,11 +101,26 @@  COMPILE_MSA = $(call COMPILE,CC,MSAFLAGS)
 %.h.c:
 	$(Q)echo '#include "$*.h"' >$@
 
+$(BIN2CEXE): ffbuild/bin2c_host.o
+	$(HOSTLD) $(HOSTLDFLAGS) $(HOSTLD_O) $^ $(HOSTEXTRALIBS)
+
 %.ptx: %.cu $(SRC_PATH)/compat/cuda/cuda_runtime.h
 	$(COMPILE_NVCC)
 
-%.ptx.c: %.ptx
-	$(Q)sh $(SRC_PATH)/compat/cuda/ptx2c.sh $@ $(patsubst $(SRC_PATH)/%,$(SRC_LINK)/%,$<)
+ifdef CONFIG_PTX_COMPRESSION
+%.ptx.gz: TAG = GZIP
+%.ptx.gz: %.ptx
+	$(M)gzip -c9 $(patsubst $(SRC_PATH)/%,$(SRC_LINK)/%,$<) >$@
+
+%.ptx.c: %.ptx.gz $(BIN2CEXE)
+	$(BIN2C) $(patsubst $(SRC_PATH)/%,$(SRC_LINK)/%,$<) $@ $(subst .,_,$(basename $(notdir $@)))
+else
+%.ptx.c: %.ptx $(BIN2CEXE)
+	$(BIN2C) $(patsubst $(SRC_PATH)/%,$(SRC_LINK)/%,$<) $@ $(subst .,_,$(basename $(notdir $@)))
+endif
+
+clean::
+	$(RM) $(BIN2CEXE)
 
 %.c %.h %.pc %.ver %.version: TAG = GEN
 
@@ -151,7 +169,7 @@  HOBJS        = $(filter-out $(SKIPHEADERS:.h=.h.o),$(ALLHEADERS:.h=.h.o))
 PTXOBJS      = $(filter %.ptx.o,$(OBJS))
 $(HOBJS):     CCFLAGS += $(CFLAGS_HEADERS)
 checkheaders: $(HOBJS)
-.SECONDARY:   $(HOBJS:.o=.c) $(PTXOBJS:.o=.c) $(PTXOBJS:.o=)
+.SECONDARY:   $(HOBJS:.o=.c) $(PTXOBJS:.o=.c) $(PTXOBJS:.o=.gz) $(PTXOBJS:.o=)
 
 alltools: $(TOOLS)
 
@@ -170,7 +188,7 @@  $(TOOLOBJS): | tools
 
 OUTDIRS := $(OUTDIRS) $(dir $(OBJS) $(HOBJS) $(HOSTOBJS) $(SLIBOBJS) $(TESTOBJS))
 
-CLEANSUFFIXES     = *.d *.gcda *.gcno *.h.c *.ho *.map *.o *.pc *.ptx *.ptx.c *.ver *.version *$(DEFAULT_X86ASMD).asm *~ *.ilk *.pdb
+CLEANSUFFIXES     = *.d *.gcda *.gcno *.h.c *.ho *.map *.o *.pc *.ptx *.ptx.gz *.ptx.c *.ver *.version *$(DEFAULT_X86ASMD).asm *~ *.ilk *.pdb
 LIBSUFFIXES       = *.a *.lib *.so *.so.* *.dylib *.dll *.def *.dll.a
 
 define RULES
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 1f8331c4f4..289dc4cbb5 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -273,7 +273,8 @@  OBJS-$(CONFIG_FILLBORDERS_FILTER)            += vf_fillborders.o
 OBJS-$(CONFIG_FIND_RECT_FILTER)              += vf_find_rect.o lavfutils.o
 OBJS-$(CONFIG_FLOODFILL_FILTER)              += vf_floodfill.o
 OBJS-$(CONFIG_FORMAT_FILTER)                 += vf_format.o
-OBJS-$(CONFIG_FORMAT_CUDA_FILTER)            += vf_format_cuda.o vf_format_cuda.ptx.o
+OBJS-$(CONFIG_FORMAT_CUDA_FILTER)            += vf_format_cuda.o vf_format_cuda.ptx.o \
+                                                cuda/load_helper.o
 OBJS-$(CONFIG_FPS_FILTER)                    += vf_fps.o
 OBJS-$(CONFIG_FRAMEPACK_FILTER)              += vf_framepack.o
 OBJS-$(CONFIG_FRAMERATE_FILTER)              += vf_framerate.o
@@ -350,7 +351,8 @@  OBJS-$(CONFIG_OCR_FILTER)                    += vf_ocr.o
 OBJS-$(CONFIG_OCV_FILTER)                    += vf_libopencv.o
 OBJS-$(CONFIG_OSCILLOSCOPE_FILTER)           += vf_datascope.o
 OBJS-$(CONFIG_OVERLAY_FILTER)                += vf_overlay.o framesync.o
-OBJS-$(CONFIG_OVERLAY_CUDA_FILTER)           += vf_overlay_cuda.o framesync.o vf_overlay_cuda.ptx.o
+OBJS-$(CONFIG_OVERLAY_CUDA_FILTER)           += vf_overlay_cuda.o framesync.o vf_overlay_cuda.ptx.o \
+                                                cuda/load_helper.o
 OBJS-$(CONFIG_OVERLAY_OPENCL_FILTER)         += vf_overlay_opencl.o opencl.o \
                                                 opencl/overlay.o framesync.o
 OBJS-$(CONFIG_OVERLAY_QSV_FILTER)            += vf_overlay_qsv.o framesync.o
@@ -395,7 +397,8 @@  OBJS-$(CONFIG_ROTATE_FILTER)                 += vf_rotate.o
 OBJS-$(CONFIG_SAB_FILTER)                    += vf_sab.o
 OBJS-$(CONFIG_SCALE_FILTER)                  += vf_scale.o scale_eval.o
 OBJS-$(CONFIG_SCALE_CUDA_FILTER)             += vf_scale_cuda.o scale_eval.o \
-                                                vf_scale_cuda.ptx.o vf_scale_cuda_bicubic.ptx.o
+                                                vf_scale_cuda.ptx.o vf_scale_cuda_bicubic.ptx.o \
+                                                cuda/load_helper.o
 OBJS-$(CONFIG_SCALE_NPP_FILTER)              += vf_scale_npp.o scale_eval.o
 OBJS-$(CONFIG_SCALE_QSV_FILTER)              += vf_scale_qsv.o
 OBJS-$(CONFIG_SCALE_VAAPI_FILTER)            += vf_scale_vaapi.o scale_eval.o vaapi_vpp.o
@@ -443,7 +446,8 @@  OBJS-$(CONFIG_TELECINE_FILTER)               += vf_telecine.o
 OBJS-$(CONFIG_THISTOGRAM_FILTER)             += vf_histogram.o
 OBJS-$(CONFIG_THRESHOLD_FILTER)              += vf_threshold.o framesync.o
 OBJS-$(CONFIG_THUMBNAIL_FILTER)              += vf_thumbnail.o
-OBJS-$(CONFIG_THUMBNAIL_CUDA_FILTER)         += vf_thumbnail_cuda.o vf_thumbnail_cuda.ptx.o
+OBJS-$(CONFIG_THUMBNAIL_CUDA_FILTER)         += vf_thumbnail_cuda.o vf_thumbnail_cuda.ptx.o \
+                                                cuda/load_helper.o
 OBJS-$(CONFIG_TILE_FILTER)                   += vf_tile.o
 OBJS-$(CONFIG_TINTERLACE_FILTER)             += vf_tinterlace.o
 OBJS-$(CONFIG_TLUT2_FILTER)                  += vf_lut2.o framesync.o
@@ -489,7 +493,7 @@  OBJS-$(CONFIG_XMEDIAN_FILTER)                += vf_xmedian.o framesync.o
 OBJS-$(CONFIG_XSTACK_FILTER)                 += vf_stack.o framesync.o
 OBJS-$(CONFIG_YADIF_FILTER)                  += vf_yadif.o yadif_common.o
 OBJS-$(CONFIG_YADIF_CUDA_FILTER)             += vf_yadif_cuda.o vf_yadif_cuda.ptx.o \
-                                                yadif_common.o
+                                                yadif_common.o cuda/load_helper.o
 OBJS-$(CONFIG_YAEPBLUR_FILTER)               += vf_yaepblur.o
 OBJS-$(CONFIG_ZMQ_FILTER)                    += f_zmq.o
 OBJS-$(CONFIG_ZOOMPAN_FILTER)                += vf_zoompan.o
diff --git a/libavfilter/cuda/load_helper.c b/libavfilter/cuda/load_helper.c
new file mode 100644
index 0000000000..62d644c29a
--- /dev/null
+++ b/libavfilter/cuda/load_helper.c
@@ -0,0 +1,124 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+
+#if CONFIG_PTX_COMPRESSION
+#include <zlib.h>
+#define CHUNK_SIZE (1024 * 64)
+#endif
+
+#include "load_helper.h"
+
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(avctx, cu, x)
+
+/**
+ * Load a CUDA module from an embedded module image.
+ *
+ * When CONFIG_PTX_COMPRESSION is enabled, data is first inflated with zlib
+ * into a temporary NUL-terminated buffer; otherwise it is handed to
+ * cuModuleLoadData() as is and length is unused.
+ *
+ * @param avctx     context passed to av_log() and CHECK_CU()
+ * @param hwctx     CUDA device context supplying the driver function table
+ * @param cu_module where the loaded module handle is stored
+ * @param data      module image, compressed if CONFIG_PTX_COMPRESSION is set
+ * @param length    number of bytes in data
+ * @return the CHECK_CU() result of cuModuleLoadData(), or a negative AVERROR
+ *         code if zlib setup, allocation or decompression fails
+ */
+int ff_cuda_load_module(void *avctx, AVCUDADeviceContext *hwctx, CUmodule *cu_module,
+                        const unsigned char *data, const unsigned int length)
+{
+    CudaFunctions *cu = hwctx->internal->cuda_dl;
+
+#if CONFIG_PTX_COMPRESSION
+    z_stream stream = { 0 };
+    uint8_t *buf, *tmp;
+    uint64_t buf_size;
+    int ret;
+
+    // windowBits: 15 selects the largest (32 KiB) window, adding 32 enables
+    // automatic detection of zlib and gzip headers
+    if (inflateInit2(&stream, 32 + 15) != Z_OK) {
+        av_log(avctx, AV_LOG_ERROR, "Error during zlib initialisation: %s\n",
+               stream.msg ? stream.msg : "unknown error");
+        return AVERROR(ENOSYS);
+    }
+
+    buf_size = CHUNK_SIZE * 4;
+    buf = av_realloc(NULL, buf_size);
+    if (!buf) {
+        inflateEnd(&stream);
+        return AVERROR(ENOMEM);
+    }
+
+    stream.next_in = data;
+    stream.avail_in = length;
+
+    do {
+        stream.avail_out = buf_size - stream.total_out;
+        stream.next_out = buf + stream.total_out;
+
+        ret = inflate(&stream, Z_FINISH);
+
+        // With Z_FINISH, inflate() returns Z_BUF_ERROR rather than Z_OK when
+        // it stops before the end of the stream. That is recoverable only if
+        // the output buffer is full; with output space left over, the input
+        // is truncated and retrying would never make progress.
+        if (ret == Z_BUF_ERROR && stream.avail_out == 0)
+            ret = Z_OK;
+
+        if (ret != Z_OK && ret != Z_STREAM_END) {
+            av_log(avctx, AV_LOG_ERROR, "zlib inflate error: %s\n",
+                   stream.msg ? stream.msg : "unknown error");
+            inflateEnd(&stream);
+            av_free(buf);
+            return AVERROR(EINVAL);
+        }
+
+        // Grow whenever the buffer is full, even at the end of the stream,
+        // so that there is always room for the terminating NUL below
+        if (stream.avail_out == 0) {
+            buf_size += CHUNK_SIZE;
+            tmp = av_realloc(buf, buf_size);
+            if (!tmp) {
+                inflateEnd(&stream);
+                av_free(buf);
+                return AVERROR(ENOMEM);
+            }
+            buf = tmp;
+        }
+    } while (ret != Z_STREAM_END);
+
+    // NUL-terminate the string; the loop above guarantees a spare byte
+    buf[stream.total_out] = 0;
+
+    inflateEnd(&stream);
+
+    ret = CHECK_CU(cu->cuModuleLoadData(cu_module, buf));
+    av_free(buf);
+    return ret;
+#else
+    return CHECK_CU(cu->cuModuleLoadData(cu_module, data));
+#endif
+}
diff --git a/libavfilter/cuda/load_helper.h b/libavfilter/cuda/load_helper.h
new file mode 100644
index 0000000000..31507d6d3e
--- /dev/null
+++ b/libavfilter/cuda/load_helper.h
@@ -0,0 +1,37 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_CUDA_LOAD_HELPER_H
+#define AVFILTER_CUDA_LOAD_HELPER_H
+
+/**
+ * Loads a CUDA module and applies any decompression, if necessary.
+ *
+ * @param avctx     context passed to av_log() and the CUDA error check
+ * @param hwctx     CUDA device context whose driver functions are used
+ * @param cu_module where the loaded module handle is stored
+ * @param data      module image; gzip-compressed when FFmpeg is built with
+ *                  CONFIG_PTX_COMPRESSION, plain otherwise
+ * @param length    size of data in bytes
+ * @return a negative AVERROR code if decompression fails, otherwise the
+ *         result of the checked cuModuleLoadData() call
+ */
+int ff_cuda_load_module(void *avctx, AVCUDADeviceContext *hwctx, CUmodule *cu_module,
+                        const unsigned char *data, const unsigned int length);
+
+#endif /* AVFILTER_CUDA_LOAD_HELPER_H */
diff --git a/libavfilter/vf_format_cuda.c b/libavfilter/vf_format_cuda.c
index 89f05b1350..bdbc6f8a58 100644
--- a/libavfilter/vf_format_cuda.c
+++ b/libavfilter/vf_format_cuda.c
@@ -38,6 +38,8 @@ 
 #include "internal.h"
 #include "video.h"
 
+#include "cuda/load_helper.h"
+
 static const enum AVPixelFormat supported_formats[] = {
     AV_PIX_FMT_YUV420P,
     AV_PIX_FMT_NV12,
@@ -236,7 +238,8 @@  static av_cold int cudaformat_config_props(AVFilterLink *outlink)
     char buf[64];
     int ret;
 
-    extern char vf_format_cuda_ptx[];
+    extern const unsigned char ff_vf_format_cuda_ptx_data[];
+    extern const unsigned int ff_vf_format_cuda_ptx_len;
 
     s->hwctx = device_hwctx;
     s->cu_stream = s->hwctx->stream;
@@ -249,7 +252,7 @@  static av_cold int cudaformat_config_props(AVFilterLink *outlink)
     if (ret < 0)
         return ret;
 
-    ret = CHECK_CU(cu->cuModuleLoadData(&s->cu_module, vf_format_cuda_ptx));
+    ret = ff_cuda_load_module(ctx, device_hwctx, &s->cu_module, ff_vf_format_cuda_ptx_data, ff_vf_format_cuda_ptx_len);
     if (ret < 0)
         goto fail;
 
diff --git a/libavfilter/vf_overlay_cuda.c b/libavfilter/vf_overlay_cuda.c
index 260b5c8fa2..a199580869 100644
--- a/libavfilter/vf_overlay_cuda.c
+++ b/libavfilter/vf_overlay_cuda.c
@@ -36,6 +36,8 @@ 
 #include "framesync.h"
 #include "internal.h"
 
+#include "cuda/load_helper.h"
+
 #define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, ctx->hwctx->internal->cuda_dl, x)
 #define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
 
@@ -432,8 +434,8 @@  static int overlay_cuda_query_formats(AVFilterContext *avctx)
  */
 static int overlay_cuda_config_output(AVFilterLink *outlink)
 {
-
-    extern char vf_overlay_cuda_ptx[];
+    extern const unsigned char ff_vf_overlay_cuda_ptx_data[];
+    extern const unsigned int ff_vf_overlay_cuda_ptx_len;
 
     int err;
     AVFilterContext* avctx = outlink->src;
@@ -509,7 +511,7 @@  static int overlay_cuda_config_output(AVFilterLink *outlink)
         return err;
     }
 
-    err = CHECK_CU(cu->cuModuleLoadData(&ctx->cu_module, vf_overlay_cuda_ptx));
+    err = ff_cuda_load_module(ctx, ctx->hwctx, &ctx->cu_module, ff_vf_overlay_cuda_ptx_data, ff_vf_overlay_cuda_ptx_len);
     if (err < 0) {
         CHECK_CU(cu->cuCtxPopCurrent(&dummy));
         return err;
diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c
index d97c7df273..c10938e96b 100644
--- a/libavfilter/vf_scale_cuda.c
+++ b/libavfilter/vf_scale_cuda.c
@@ -39,6 +39,7 @@ 
 #include "scale_eval.h"
 #include "video.h"
 
+#include "cuda/load_helper.h"
 #include "vf_scale_cuda.h"
 
 static const enum AVPixelFormat supported_formats[] = {
@@ -275,34 +276,41 @@  static av_cold int cudascale_config_props(AVFilterLink *outlink)
     int w, h;
     int ret;
 
-    char *scaler_ptx;
+    const unsigned char *scaler_ptx;
+    unsigned int scaler_ptx_len;
     const char *function_infix = "";
 
-    extern char vf_scale_cuda_ptx[];
-    extern char vf_scale_cuda_bicubic_ptx[];
+    extern const unsigned char ff_vf_scale_cuda_ptx_data[];
+    extern const unsigned int ff_vf_scale_cuda_ptx_len;
+    extern const unsigned char ff_vf_scale_cuda_bicubic_ptx_data[];
+    extern const unsigned int ff_vf_scale_cuda_bicubic_ptx_len;
 
     switch(s->interp_algo) {
     case INTERP_ALGO_NEAREST:
-        scaler_ptx = vf_scale_cuda_ptx;
+        scaler_ptx = ff_vf_scale_cuda_ptx_data;
+        scaler_ptx_len = ff_vf_scale_cuda_ptx_len;
         function_infix = "_Nearest";
         s->interp_use_linear = 0;
         s->interp_as_integer = 1;
         break;
     case INTERP_ALGO_BILINEAR:
-        scaler_ptx = vf_scale_cuda_ptx;
+        scaler_ptx = ff_vf_scale_cuda_ptx_data;
+        scaler_ptx_len = ff_vf_scale_cuda_ptx_len;
         function_infix = "_Bilinear";
         s->interp_use_linear = 1;
         s->interp_as_integer = 1;
         break;
     case INTERP_ALGO_DEFAULT:
     case INTERP_ALGO_BICUBIC:
-        scaler_ptx = vf_scale_cuda_bicubic_ptx;
+        scaler_ptx = ff_vf_scale_cuda_bicubic_ptx_data;
+        scaler_ptx_len = ff_vf_scale_cuda_bicubic_ptx_len;
         function_infix = "_Bicubic";
         s->interp_use_linear = 0;
         s->interp_as_integer = 0;
         break;
     case INTERP_ALGO_LANCZOS:
-        scaler_ptx = vf_scale_cuda_bicubic_ptx;
+        scaler_ptx = ff_vf_scale_cuda_bicubic_ptx_data;
+        scaler_ptx_len = ff_vf_scale_cuda_bicubic_ptx_len;
         function_infix = "_Lanczos";
         s->interp_use_linear = 0;
         s->interp_as_integer = 0;
@@ -319,7 +327,7 @@  static av_cold int cudascale_config_props(AVFilterLink *outlink)
     if (ret < 0)
         goto fail;
 
-    ret = CHECK_CU(cu->cuModuleLoadData(&s->cu_module, scaler_ptx));
+    ret = ff_cuda_load_module(ctx, device_hwctx, &s->cu_module, scaler_ptx, scaler_ptx_len);
     if (ret < 0)
         goto fail;
 
diff --git a/libavfilter/vf_thumbnail_cuda.c b/libavfilter/vf_thumbnail_cuda.c
index aab3ea8cc7..ceac10f72f 100644
--- a/libavfilter/vf_thumbnail_cuda.c
+++ b/libavfilter/vf_thumbnail_cuda.c
@@ -29,6 +29,8 @@ 
 #include "avfilter.h"
 #include "internal.h"
 
+#include "cuda/load_helper.h"
+
 #define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
 
 #define HIST_SIZE (3*256)
@@ -358,7 +360,8 @@  static int config_props(AVFilterLink *inlink)
     CudaFunctions *cu = device_hwctx->internal->cuda_dl;
     int ret;
 
-    extern char vf_thumbnail_cuda_ptx[];
+    extern const unsigned char ff_vf_thumbnail_cuda_ptx_data[];
+    extern const unsigned int ff_vf_thumbnail_cuda_ptx_len;
 
     s->hwctx = device_hwctx;
     s->cu_stream = s->hwctx->stream;
@@ -367,7 +370,7 @@  static int config_props(AVFilterLink *inlink)
     if (ret < 0)
         return ret;
 
-    ret = CHECK_CU(cu->cuModuleLoadData(&s->cu_module, vf_thumbnail_cuda_ptx));
+    ret = ff_cuda_load_module(ctx, device_hwctx, &s->cu_module, ff_vf_thumbnail_cuda_ptx_data, ff_vf_thumbnail_cuda_ptx_len);
     if (ret < 0)
         return ret;
 
diff --git a/libavfilter/vf_yadif_cuda.c b/libavfilter/vf_yadif_cuda.c
index bbdbfc1adc..5099f0a806 100644
--- a/libavfilter/vf_yadif_cuda.c
+++ b/libavfilter/vf_yadif_cuda.c
@@ -24,7 +24,10 @@ 
 #include "internal.h"
 #include "yadif.h"
 
-extern char vf_yadif_cuda_ptx[];
+#include "cuda/load_helper.h"
+
+extern const unsigned char ff_vf_yadif_cuda_ptx_data[];
+extern const unsigned int ff_vf_yadif_cuda_ptx_len;
 
 typedef struct DeintCUDAContext {
     YADIFContext yadif;
@@ -318,7 +321,7 @@  static int config_output(AVFilterLink *link)
     if (ret < 0)
         goto exit;
 
-    ret = CHECK_CU(cu->cuModuleLoadData(&s->cu_module, vf_yadif_cuda_ptx));
+    ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module, ff_vf_yadif_cuda_ptx_data, ff_vf_yadif_cuda_ptx_len);
     if (ret < 0)
         goto exit;