diff mbox series

[FFmpeg-devel,v3] lavfi: add nlmeans CUDA filter

Message ID CAPa-kAxtBALsvSHCdjzr8g3p-wgdML=YYWJEYcC2qSYqJPcc0A@mail.gmail.com
State New
Headers show
Series [FFmpeg-devel,v3] lavfi: add nlmeans CUDA filter | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_ppc success Make finished
andriy/make_fate_ppc success Make fate finished

Commit Message

Dylan Fernando Sept. 8, 2021, 6:23 p.m. UTC
Subject: [PATCH] lavfi: add nlmeans_cuda filter

Signed-off-by: Dylan Fernando <dylanf123@gmail.com>
---
 configure                      |   2 +
 doc/filters.texi               |   4 +
 libavfilter/Makefile           |   2 +
 libavfilter/allfilters.c       |   1 +
 libavfilter/version.h          |   4 +-
 libavfilter/vf_nlmeans_cuda.c  | 883 +++++++++++++++++++++++++++++++++
 libavfilter/vf_nlmeans_cuda.cu | 378 ++++++++++++++
 7 files changed, 1272 insertions(+), 2 deletions(-)
 create mode 100644 libavfilter/vf_nlmeans_cuda.c
 create mode 100644 libavfilter/vf_nlmeans_cuda.cu

Comments

Dylan Fernando Sept. 22, 2021, 11:42 p.m. UTC | #1
Any feedback for this?
diff mbox series

Patch

diff --git a/configure b/configure
index af410a9d11..7fa67e415e 100755
--- a/configure
+++ b/configure
@@ -3094,6 +3094,8 @@  thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
 overlay_cuda_filter_deps="ffnvcodec"
 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+nlmeans_cuda_filter_deps="ffnvcodec"
+nlmeans_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 
 amf_deps_any="libdl LoadLibrary"
 nvenc_deps="ffnvcodec"
diff --git a/doc/filters.texi b/doc/filters.texi
index 9ad6031d23..b5eb9ecd33 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -15380,6 +15380,10 @@  Same as @option{r} but for chroma planes.
 The default value is @var{0} and means automatic.
 @end table
 
+@section nlmeans_cuda
+
+Non-local Means denoise filter through CUDA, this filter accepts same options as @ref{nlmeans}.
+
 @section nnedi
 
 Deinterlace video using neural network edge directed interpolation.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index af957a5ac0..7a61d7591e 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -347,6 +347,8 @@  OBJS-$(CONFIG_MPDECIMATE_FILTER)             += vf_mpdecimate.o
 OBJS-$(CONFIG_NEGATE_FILTER)                 += vf_lut.o
 OBJS-$(CONFIG_NLMEANS_FILTER)                += vf_nlmeans.o
 OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER)         += vf_nlmeans_opencl.o opencl.o opencl/nlmeans.o
+OBJS-$(CONFIG_NLMEANS_CUDA_FILTER)           += vf_nlmeans_cuda.o vf_nlmeans_cuda.ptx.o \
+                                                cuda/load_helper.o
 OBJS-$(CONFIG_NNEDI_FILTER)                  += vf_nnedi.o
 OBJS-$(CONFIG_NOFORMAT_FILTER)               += vf_format.o
 OBJS-$(CONFIG_NOISE_FILTER)                  += vf_noise.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 0c6b2347c8..d65c13011c 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -333,6 +333,7 @@  extern const AVFilter ff_vf_msad;
 extern const AVFilter ff_vf_negate;
 extern const AVFilter ff_vf_nlmeans;
 extern const AVFilter ff_vf_nlmeans_opencl;
+extern const AVFilter ff_vf_nlmeans_cuda;
 extern const AVFilter ff_vf_nnedi;
 extern const AVFilter ff_vf_noformat;
 extern const AVFilter ff_vf_noise;
diff --git a/libavfilter/version.h b/libavfilter/version.h
index 2110048b77..306bb62ff4 100644
--- a/libavfilter/version.h
+++ b/libavfilter/version.h
@@ -30,8 +30,8 @@ 
 #include "libavutil/version.h"
 
 #define LIBAVFILTER_VERSION_MAJOR   8
-#define LIBAVFILTER_VERSION_MINOR   7
-#define LIBAVFILTER_VERSION_MICRO 101
+#define LIBAVFILTER_VERSION_MINOR   8
+#define LIBAVFILTER_VERSION_MICRO 100
 
 
 #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
diff --git a/libavfilter/vf_nlmeans_cuda.c b/libavfilter/vf_nlmeans_cuda.c
new file mode 100644
index 0000000000..3ecc7c8945
--- /dev/null
+++ b/libavfilter/vf_nlmeans_cuda.c
@@ -0,0 +1,883 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "avfilter.h"
+#include "internal.h"
+
+#include "cuda/load_helper.h"
+
+static const enum AVPixelFormat supported_formats[] = {
+    AV_PIX_FMT_NV12,
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_YUV444P
+};
+
+
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
+
+#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
+#define BLOCKX 32
+#define BLOCKY 16
+
+
+
+typedef struct NLMeansCudaContext {
+    const AVClass *class;
+
+    double                sigma;
+    int                   patch_size;
+    int                   patch_size_uv;
+    int                   research_size;
+    int                   research_size_uv;
+    int                   initialised;
+
+    float                 h;
+
+    AVBufferRef *hw_frames_ctx;
+    AVCUDADeviceContext *hwctx;
+
+    CUmodule    cu_module;
+
+    CUfunction  cu_func_horiz_uchar;
+    CUfunction  cu_func_horiz_uchar2;
+    CUfunction  cu_func_vert_uchar;
+    CUfunction  cu_func_vert_uchar2;
+    CUfunction  cu_func_weight_uchar;
+    CUfunction  cu_func_weight_uchar2;
+    CUfunction  cu_func_average_uchar;
+    CUfunction  cu_func_average_uchar2;
+    CUstream    cu_stream;
+
+
+    CUdeviceptr integral_img;
+    CUdeviceptr integral_img2;
+    CUdeviceptr weight;
+    CUdeviceptr weight2;
+    CUdeviceptr sum;
+    CUdeviceptr sum2;
+
+} NLMeansCudaContext;
+
+#define OFFSET(x) offsetof(NLMeansCudaContext, x)
+#define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
+
+static const AVOption nlmeans_cuda_options[] = {
+    { "s",  "denoising strength", OFFSET(sigma), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 30.0, FLAGS },
+    { "p",  "patch size",                   OFFSET(patch_size),    AV_OPT_TYPE_INT, { .i64 = 2*3+1 }, 0, 99, FLAGS },
+    { "pc", "patch size for chroma planes", OFFSET(patch_size_uv), AV_OPT_TYPE_INT, { .i64 = 0 },     0, 99, FLAGS },
+    { "r",  "research window",                   OFFSET(research_size),    AV_OPT_TYPE_INT, { .i64 = 7*2+1 }, 0, 99, FLAGS },
+    { "rc", "research window for chroma planes", OFFSET(research_size_uv), AV_OPT_TYPE_INT, { .i64 = 0 },     0, 99, FLAGS },
+    { NULL }
+};
+
+
+
+static av_cold int init(AVFilterContext *avctx)
+{
+    NLMeansCudaContext *ctx = avctx->priv;
+
+    ctx->h = ctx->sigma * 10;
+    if (!(ctx->research_size & 1)) {
+        ctx->research_size |= 1;
+        av_log(avctx, AV_LOG_WARNING,
+               "research_size should be odd, set to %d",
+               ctx->research_size);
+    }
+
+    if (!(ctx->patch_size & 1)) {
+        ctx->patch_size |= 1;
+        av_log(avctx, AV_LOG_WARNING,
+               "patch_size should be odd, set to %d",
+               ctx->patch_size);
+    }
+
+    if (!ctx->research_size_uv)
+        ctx->research_size_uv = ctx->research_size;
+    if (!ctx->patch_size_uv)
+        ctx->patch_size_uv = ctx->patch_size;
+
+
+    ctx->initialised = 1;
+
+    return 0;
+}
+
+
+static int format_is_supported(enum AVPixelFormat fmt)
+{
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
+        if (supported_formats[i] == fmt)
+            return 1;
+    return 0;
+}
+
+
+static int call_average2(AVFilterContext *ctx, int channels,
+                         uint8_t *src_dptr, int src_width, int src_height, int src_pitch,
+                         float *sum, float *sum2, float *weight, float *weight2,
+                         uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch,
+                         int pixel_size)
+{
+    NLMeansCudaContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+    CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr;
+    CUtexObject tex = 0;
+    void *args_uchar[] = { &tex, &dst_devptr, &src_width, &dst_pitch, &sum, &sum2, &weight, &weight2};
+
+    int ret;
+
+    CUDA_TEXTURE_DESC tex_desc = {
+        .filterMode = CU_TR_FILTER_MODE_POINT,
+        .flags = CU_TRSF_READ_AS_INTEGER,
+    };
+
+    CUDA_RESOURCE_DESC res_desc = {
+        .resType = CU_RESOURCE_TYPE_PITCH2D,
+        .res.pitch2D.format = pixel_size == 1 ?
+                              CU_AD_FORMAT_UNSIGNED_INT8 :
+                              CU_AD_FORMAT_UNSIGNED_INT16,
+        .res.pitch2D.numChannels = channels,
+        .res.pitch2D.width = src_width,
+        .res.pitch2D.height = src_height,
+        .res.pitch2D.pitchInBytes = src_pitch,
+        .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
+    };
+
+    dst_pitch /= channels * pixel_size;
+
+    ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
+    if (ret < 0)
+        goto exit;
+
+    ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_average_uchar2,
+                                      DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1,
+                                      BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL));
+
+exit:
+    if (tex)
+        CHECK_CU(cu->cuTexObjectDestroy(tex));
+
+    return ret;
+}
+
+
+static int call_average(AVFilterContext *ctx, int channels,
+                        uint8_t *src_dptr, int src_width, int src_height, int src_pitch, float *sum, float *weight,
+                        uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch,
+                        int pixel_size)
+{
+    NLMeansCudaContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+    CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr;
+    CUtexObject tex = 0;
+    void *args_uchar[] = { &tex, &dst_devptr, &src_width, &dst_pitch, &sum, &weight};
+
+    int ret;
+
+    CUDA_TEXTURE_DESC tex_desc = {
+        .filterMode = CU_TR_FILTER_MODE_POINT,
+        .flags = CU_TRSF_READ_AS_INTEGER,
+    };
+
+    CUDA_RESOURCE_DESC res_desc = {
+        .resType = CU_RESOURCE_TYPE_PITCH2D,
+        .res.pitch2D.format = pixel_size == 1 ?
+                              CU_AD_FORMAT_UNSIGNED_INT8 :
+                              CU_AD_FORMAT_UNSIGNED_INT16,
+        .res.pitch2D.numChannels = channels,
+        .res.pitch2D.width = src_width,
+        .res.pitch2D.height = src_height,
+        .res.pitch2D.pitchInBytes = src_pitch,
+        .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
+    };
+
+    dst_pitch /= channels * pixel_size;
+
+    ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
+    if (ret < 0)
+        goto exit;
+
+    ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_average_uchar,
+                                      DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1,
+                                      BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL));
+
+exit:
+    if (tex)
+        CHECK_CU(cu->cuTexObjectDestroy(tex));
+
+    return ret;
+}
+
+static int call_weight2(AVFilterContext *ctx, int channels,
+                        uint8_t *src_dptr, int src_width, int src_height, int src_pitch, int *integ_img, int *integ_img2,
+                        float *sum, float *sum2, float *weight, float *weight2, int p,
+                        int *dx_cur, int *dy_cur,
+                        int pixel_size)
+{
+    NLMeansCudaContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+    CUtexObject tex = 0;
+    int ret;
+
+
+    void *args_uchar[] = { &sum, &sum2, &weight, &weight2, &integ_img, &integ_img2, &tex, &src_width, &src_height, &p, &s->h, dx_cur, dy_cur };
+
+
+    CUDA_TEXTURE_DESC tex_desc = {
+        .filterMode = CU_TR_FILTER_MODE_POINT,
+        .flags = CU_TRSF_READ_AS_INTEGER,
+    };
+
+    CUDA_RESOURCE_DESC res_desc = {
+        .resType = CU_RESOURCE_TYPE_PITCH2D,
+        .res.pitch2D.format = pixel_size == 1 ?
+                              CU_AD_FORMAT_UNSIGNED_INT8 :
+                              CU_AD_FORMAT_UNSIGNED_INT16,
+        .res.pitch2D.numChannels = channels,
+        .res.pitch2D.width = src_width,
+        .res.pitch2D.height = src_height,
+        .res.pitch2D.pitchInBytes = src_pitch,
+        .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
+    };
+
+    src_pitch /= channels * pixel_size;
+
+    ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
+    if (ret < 0)
+        goto exit;
+
+    ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_weight_uchar2,
+                                      DIV_UP(src_width, BLOCKX), DIV_UP(src_height, BLOCKY), 1,
+                                      BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL));
+
+exit:
+    if (tex)
+        CHECK_CU(cu->cuTexObjectDestroy(tex));
+
+    return ret;
+}
+
+
+
+static int call_weight(AVFilterContext *ctx, int channels,
+                       uint8_t *src_dptr, int src_width, int src_height, int src_pitch, int *integ_img, float *sum, float *weight, int p,
+                       int *dx_cur, int *dy_cur,
+                       int pixel_size)
+{
+    NLMeansCudaContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+    CUtexObject tex = 0;
+    int ret;
+
+
+    void *args_uchar[] = { &sum, &weight, &integ_img, &tex, &src_width, &src_height, &p, &s->h, dx_cur, dy_cur };
+
+
+    CUDA_TEXTURE_DESC tex_desc = {
+        .filterMode = CU_TR_FILTER_MODE_POINT,
+        .flags = CU_TRSF_READ_AS_INTEGER,
+    };
+
+    CUDA_RESOURCE_DESC res_desc = {
+        .resType = CU_RESOURCE_TYPE_PITCH2D,
+        .res.pitch2D.format = pixel_size == 1 ?
+                              CU_AD_FORMAT_UNSIGNED_INT8 :
+                              CU_AD_FORMAT_UNSIGNED_INT16,
+        .res.pitch2D.numChannels = channels,
+        .res.pitch2D.width = src_width,
+        .res.pitch2D.height = src_height,
+        .res.pitch2D.pitchInBytes = src_pitch,
+        .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
+    };
+
+    src_pitch /= channels * pixel_size;
+
+    ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
+    if (ret < 0)
+        goto exit;
+
+    ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_weight_uchar,
+                                      DIV_UP(src_width, BLOCKX), DIV_UP(src_height, BLOCKY), 1,
+                                      BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL));
+
+exit:
+    if (tex)
+        CHECK_CU(cu->cuTexObjectDestroy(tex));
+
+    return ret;
+}
+
+
+static int call_vert2(AVFilterContext *ctx, int channels,
+                      int src_width, int src_height, int *integ_img, int *integ_img2,
+                      int pixel_size)
+{
+    NLMeansCudaContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+    int ret;
+
+    void *args_uchar[] = { &integ_img, &integ_img2, &src_width, &src_height };
+
+
+    ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_vert_uchar2,
+                                      DIV_UP(src_width, BLOCKX), 1, 1,
+                                      BLOCKX, 1, 1, 0, s->cu_stream, args_uchar, NULL));
+
+
+    return ret;
+}
+
+
+static int call_vert(AVFilterContext *ctx, int channels,
+                     int src_width, int src_height, int *integ_img,
+                     int pixel_size)
+{
+    NLMeansCudaContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+    int ret;
+
+    void *args_uchar[] = { &integ_img, &src_width, &src_height };
+
+
+    ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_vert_uchar,
+                                      DIV_UP(src_width, BLOCKX), 1, 1,
+                                      BLOCKX, 1, 1, 0, s->cu_stream, args_uchar, NULL));
+
+
+
+    return ret;
+}
+
+
+static int call_horiz2(AVFilterContext *ctx, int channels,
+                       uint8_t *src_dptr, int src_width, int src_height, int src_pitch, int *integ_img, int *integ_img2,
+                       int *dx_cur, int *dy_cur,
+                       int pixel_size)
+{
+    NLMeansCudaContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+    CUtexObject tex = 0;
+    int ret;
+
+
+    void *args_uchar[] = { &integ_img, &integ_img2, &tex, &src_width, &src_height, dx_cur, dy_cur };
+
+
+    CUDA_TEXTURE_DESC tex_desc = {
+        .filterMode = CU_TR_FILTER_MODE_POINT,
+        .flags = CU_TRSF_READ_AS_INTEGER,
+    };
+
+    CUDA_RESOURCE_DESC res_desc = {
+        .resType = CU_RESOURCE_TYPE_PITCH2D,
+        .res.pitch2D.format = pixel_size == 1 ?
+                              CU_AD_FORMAT_UNSIGNED_INT8 :
+                              CU_AD_FORMAT_UNSIGNED_INT16,
+        .res.pitch2D.numChannels = channels,
+        .res.pitch2D.width = src_width,
+        .res.pitch2D.height = src_height,
+        .res.pitch2D.pitchInBytes = src_pitch,
+        .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
+    };
+
+    src_pitch /= channels * pixel_size;
+
+    ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
+    if (ret < 0)
+        goto exit;
+
+    ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_horiz_uchar2,
+                                      1, DIV_UP(src_height, BLOCKY), 1,
+                                      1, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL));
+
+exit:
+    if (tex)
+        CHECK_CU(cu->cuTexObjectDestroy(tex));
+
+    return ret;
+}
+
+
+static int call_horiz(AVFilterContext *ctx, int channels,
+                      uint8_t *src_dptr, int src_width, int src_height, int src_pitch, int *integ_img,
+                      int *dx_cur, int *dy_cur,
+                      int pixel_size)
+{
+    NLMeansCudaContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+    CUtexObject tex = 0;
+    int ret;
+
+
+    void *args_uchar[] = { &integ_img, &tex, &src_width, &src_height, dx_cur, dy_cur };
+
+
+    CUDA_TEXTURE_DESC tex_desc = {
+        .filterMode = CU_TR_FILTER_MODE_POINT,
+        .flags = CU_TRSF_READ_AS_INTEGER,
+    };
+
+    CUDA_RESOURCE_DESC res_desc = {
+        .resType = CU_RESOURCE_TYPE_PITCH2D,
+        .res.pitch2D.format = pixel_size == 1 ?
+                              CU_AD_FORMAT_UNSIGNED_INT8 :
+                              CU_AD_FORMAT_UNSIGNED_INT16,
+        .res.pitch2D.numChannels = channels,
+        .res.pitch2D.width = src_width,
+        .res.pitch2D.height = src_height,
+        .res.pitch2D.pitchInBytes = src_pitch,
+        .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
+    };
+
+    src_pitch /= channels * pixel_size;
+
+    ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
+    if (ret < 0)
+        goto exit;
+
+    ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_horiz_uchar,
+                                      1, DIV_UP(src_height, BLOCKY), 1,
+                                      1, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL));
+
+exit:
+    if (tex)
+        CHECK_CU(cu->cuTexObjectDestroy(tex));
+
+    return ret;
+}
+
+
+static void nlmeans_plane(AVFilterContext *ctx, int channels,
+                          uint8_t *src_dptr, int src_width, int src_height, int src_pitch,
+                          uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch,
+                          int *integ_img,
+                          int p, int r,
+                          int pixel_size)
+{
+    NLMeansCudaContext *s   = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+
+    int nb_pixel, *tmp = NULL, idx = 0;
+    int *dxdy = NULL;
+    int i, dx, dy = 0;
+
+    nb_pixel = (2 * r + 1) * (2 * r + 1) - 1;
+    dxdy = av_malloc(nb_pixel * 2 * sizeof(int));
+    tmp = av_malloc(nb_pixel * 2 * sizeof(int));
+
+    for (dx = -r; dx <= r; dx++) {
+        for (dy = -r; dy <= r; dy++) {
+            if (dx || dy) {
+                tmp[idx++] = dx;
+                tmp[idx++] = dy;
+            }
+        }
+    }
+    // repack dx/dy seperately, as we want to do four pairs of dx/dy in a batch
+    for (i = 0; i < nb_pixel / 4; i++) {
+        dxdy[i * 8] = tmp[i * 8];         // dx0
+        dxdy[i * 8 + 1] = tmp[i * 8 + 2]; // dx1
+        dxdy[i * 8 + 2] = tmp[i * 8 + 4]; // dx2
+        dxdy[i * 8 + 3] = tmp[i * 8 + 6]; // dx3
+        dxdy[i * 8 + 4] = tmp[i * 8 + 1]; // dy0
+        dxdy[i * 8 + 5] = tmp[i * 8 + 3]; // dy1
+        dxdy[i * 8 + 6] = tmp[i * 8 + 5]; // dy2
+        dxdy[i * 8 + 7] = tmp[i * 8 + 7]; // dy3
+    }
+    av_freep(&tmp);
+
+
+    // fill with 0s
+    CHECK_CU(cu->cuMemsetD8Async(s->weight, 0, src_width * src_height * sizeof(float), s->cu_stream));
+    CHECK_CU(cu->cuMemsetD8Async(s->sum, 0, src_width * src_height * sizeof(float), s->cu_stream));
+
+
+    for (i = 0; i < nb_pixel / 4; i++) {
+
+        int *dx_cur = dxdy + 8 * i;
+        int *dy_cur = dxdy + 8 * i + 4;
+
+        call_horiz(ctx, 1, src_dptr, src_width, src_height, src_pitch,
+                   integ_img, dx_cur, dy_cur, pixel_size);
+
+        call_vert(ctx, 1, src_width, src_height, integ_img, pixel_size);
+
+        call_weight(ctx, 1, src_dptr, src_width, src_height, src_pitch, integ_img, (float*)s->sum, (float*)s->weight, p, dx_cur, dy_cur, pixel_size);
+    }
+
+    call_average(ctx, 1, src_dptr, src_width, src_height, src_pitch, (float*)s->sum, (float*)s->weight,
+                   dst_dptr, dst_width, dst_height, dst_pitch, pixel_size);
+}
+
+static void nlmeans_plane2(AVFilterContext *ctx, int channels,
+                           uint8_t *src_dptr, int src_width, int src_height, int src_pitch,
+                           uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch,
+                           int *integ_img, int *integ_img2,
+                           int p, int r,
+                           int pixel_size)
+{
+    NLMeansCudaContext *s   = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+    int nb_pixel, *tmp = NULL, idx = 0;
+    int *dxdy = NULL;
+    int i, dx, dy = 0;
+
+    nb_pixel = (2 * r + 1) * (2 * r + 1) - 1;
+    dxdy = av_malloc(nb_pixel * 2 * sizeof(int));
+    tmp = av_malloc(nb_pixel * 2 * sizeof(int));
+
+    for (dx = -r; dx <= r; dx++) {
+        for (dy = -r; dy <= r; dy++) {
+            if (dx || dy) {
+                tmp[idx++] = dx;
+                tmp[idx++] = dy;
+            }
+        }
+    }
+    // repack dx/dy seperately, as we want to do four pairs of dx/dy in a batch
+    for (i = 0; i < nb_pixel / 4; i++) {
+        dxdy[i * 8] = tmp[i * 8];         // dx0
+        dxdy[i * 8 + 1] = tmp[i * 8 + 2]; // dx1
+        dxdy[i * 8 + 2] = tmp[i * 8 + 4]; // dx2
+        dxdy[i * 8 + 3] = tmp[i * 8 + 6]; // dx3
+        dxdy[i * 8 + 4] = tmp[i * 8 + 1]; // dy0
+        dxdy[i * 8 + 5] = tmp[i * 8 + 3]; // dy1
+        dxdy[i * 8 + 6] = tmp[i * 8 + 5]; // dy2
+        dxdy[i * 8 + 7] = tmp[i * 8 + 7]; // dy3
+    }
+    av_freep(&tmp);
+
+
+    // fill with 0s
+    CHECK_CU(cu->cuMemsetD8Async(s->weight, 0, src_width * src_height * sizeof(float), s->cu_stream));
+    CHECK_CU(cu->cuMemsetD8Async(s->sum, 0, src_width * src_height * sizeof(float), s->cu_stream));
+    CHECK_CU(cu->cuMemsetD8Async(s->weight2, 0, src_width * src_height * sizeof(float), s->cu_stream));
+    CHECK_CU(cu->cuMemsetD8Async(s->sum2, 0, src_width * src_height * sizeof(float), s->cu_stream));
+
+    for (i = 0; i < nb_pixel / 4; i++) {
+
+        int *dx_cur = dxdy + 8 * i;
+        int *dy_cur = dxdy + 8 * i + 4;
+
+        call_horiz2(ctx, 2, src_dptr, src_width, src_height, src_pitch,
+                    integ_img, integ_img2, dx_cur, dy_cur, pixel_size);
+
+        call_vert2(ctx, 2, src_width, src_height, integ_img, integ_img2, pixel_size);
+
+        call_weight2(ctx, 2, src_dptr, src_width, src_height, src_pitch, integ_img, integ_img2, (float*)s->sum, (float*)s->sum2, (float*)s->weight, (float*)s->weight2, p, dx_cur, dy_cur, pixel_size);
+    }
+
+    call_average2(ctx, 2, src_dptr, src_width, src_height, src_pitch, (float*)s->sum, (float*)s->sum2, (float*)s->weight, (float*)s->weight2,
+                  dst_dptr, dst_width, dst_height, dst_pitch, pixel_size);
+}
+
+
+static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
+{
+    AVFilterContext *ctx  = inlink->dst;
+    NLMeansCudaContext *s   = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+    AVFilterLink *outlink = ctx->outputs[0];
+    AVHWFramesContext *hw_frames_ctx = (AVHWFramesContext*)s->hw_frames_ctx->data;
+    CUcontext dummy;
+
+    AVFrame *output = NULL;
+    int err, patch, research, patch_uv, research_uv;
+
+    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+    if (!output) {
+        err = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    err = av_frame_copy_props(output, frame);
+    if (err < 0)
+        goto fail;
+
+    if (!s->initialised) {
+        init(ctx);
+    }
+
+    patch = s->patch_size / 2;
+    research = s->research_size / 2;
+    patch_uv = s->patch_size_uv / 2;
+    research_uv = s->research_size_uv / 2;
+
+    err = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
+    if (err < 0)
+        return err;
+
+
+
+    switch (hw_frames_ctx->sw_format) {
+    case AV_PIX_FMT_NV12:
+        nlmeans_plane(ctx, 1, frame->data[0], inlink->w, inlink->h, frame->linesize[0],
+                      output->data[0], output->width, output->height, output->linesize[0],
+                      (int*)s->integral_img, patch, research, 1);
+
+        nlmeans_plane2(ctx, 2, frame->data[1], inlink->w / 2, inlink->h / 2, frame->linesize[1],
+                       output->data[1], output->width / 2, output->height / 2, output->linesize[1],
+                       (int*)s->integral_img, (int*)s->integral_img2, patch_uv, research_uv, 1);
+
+        break;
+    case AV_PIX_FMT_YUV420P:
+        nlmeans_plane(ctx, 1, frame->data[0], inlink->w, inlink->h, frame->linesize[0],
+                      output->data[0], output->width, output->height, output->linesize[0],
+                      (int*)s->integral_img, patch, research, 1);
+
+        nlmeans_plane(ctx, 1, frame->data[1], inlink->w / 2, inlink->h / 2, frame->linesize[1],
+                      output->data[1], output->width / 2, output->height / 2, output->linesize[1],
+                      (int*)s->integral_img, patch_uv, research_uv, 1);
+
+        nlmeans_plane(ctx, 1, frame->data[2], inlink->w / 2, inlink->h / 2, frame->linesize[2],
+                      output->data[2], output->width / 2, output->height / 2, output->linesize[2],
+                      (int*)s->integral_img, patch_uv, research_uv, 1);
+
+        break;
+    case AV_PIX_FMT_YUV444P:
+        nlmeans_plane(ctx, 1, frame->data[0], inlink->w, inlink->h, frame->linesize[0],
+                      output->data[0], output->width, output->height, output->linesize[0],
+                      (int*)s->integral_img, patch, research, 1);
+
+        nlmeans_plane(ctx, 1, frame->data[1], inlink->w, inlink->h, frame->linesize[1],
+                      output->data[1], output->width, output->height, output->linesize[1],
+                      (int*)s->integral_img, patch_uv, research_uv, 1);
+
+        nlmeans_plane(ctx, 1, frame->data[2], inlink->w, inlink->h, frame->linesize[2],
+                      output->data[2], output->width, output->height, output->linesize[2],
+                      (int*)s->integral_img, patch_uv, research_uv, 1);
+
+        break;
+    default:
+        return AVERROR_BUG;
+    }
+
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    if (err < 0)
+        return err;
+
+    av_frame_free(&frame);
+
+    return ff_filter_frame(outlink, output);
+
+fail:
+    av_frame_free(&frame);
+    av_frame_free(&output);
+    return err;
+}
+
+static av_cold void uninit(AVFilterContext *ctx)
+{
+    NLMeansCudaContext *s = ctx->priv;
+
+    if (s->hwctx && s->cu_module) {
+        CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+        if (s->integral_img) {
+            CHECK_CU(cu->cuMemFree(s->integral_img));
+            s->integral_img = 0;
+        }
+
+        if (s->integral_img2) {
+            CHECK_CU(cu->cuMemFree(s->integral_img2));
+            s->integral_img2 = 0;
+        }
+
+        if (s->weight) {
+            CHECK_CU(cu->cuMemFree(s->weight));
+            s->weight = 0;
+        }
+
+        if (s->weight2) {
+            CHECK_CU(cu->cuMemFree(s->weight2));
+            s->weight2 = 0;
+        }
+
+        if (s->sum) {
+            CHECK_CU(cu->cuMemFree(s->sum));
+            s->sum = 0;
+        }
+
+        if (s->sum2) {
+            CHECK_CU(cu->cuMemFree(s->sum2));
+            s->sum2 = 0;
+        }
+
+        if (s->cu_module) {
+            CHECK_CU(cu->cuModuleUnload(s->cu_module));
+            s->cu_module = NULL;
+        }
+    }
+}
+
+
+static int config_props(AVFilterLink *inlink)
+{
+    AVFilterContext *ctx = inlink->dst;
+    NLMeansCudaContext *s = ctx->priv;
+    AVHWFramesContext     *hw_frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
+    AVCUDADeviceContext *device_hwctx = hw_frames_ctx->device_ctx->hwctx;
+    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
+    CudaFunctions *cu = device_hwctx->internal->cuda_dl;
+    int ret;
+
+    extern const unsigned char ff_vf_nlmeans_cuda_ptx_data[];
+    extern const unsigned int ff_vf_nlmeans_cuda_ptx_len;
+
+    int width = inlink->w;
+    int height = inlink->h;
+
+    s->hwctx = device_hwctx;
+    s->cu_stream = s->hwctx->stream;
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
+    if (ret < 0)
+        return ret;
+
+    ret = ff_cuda_load_module(ctx, device_hwctx, &s->cu_module, ff_vf_nlmeans_cuda_ptx_data, ff_vf_nlmeans_cuda_ptx_len);
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_horiz_uchar, s->cu_module, "nlmeans_horiz_uchar"));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_horiz_uchar2, s->cu_module, "nlmeans_horiz_uchar2"));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_vert_uchar, s->cu_module, "nlmeans_vert_uchar"));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_vert_uchar2, s->cu_module, "nlmeans_vert_uchar2"));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_weight_uchar, s->cu_module, "nlmeans_weight_uchar"));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_weight_uchar2, s->cu_module, "nlmeans_weight_uchar2"));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_average_uchar, s->cu_module, "nlmeans_average_uchar"));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_average_uchar2, s->cu_module, "nlmeans_average_uchar2"));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuMemAlloc(&s->integral_img, 4 * width * height * sizeof(int)));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuMemAlloc(&s->integral_img2, 4 * width * height * sizeof(int)));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuMemAlloc(&s->weight, width * height * sizeof(float)));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuMemAlloc(&s->sum, width * height * sizeof(float)));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuMemAlloc(&s->weight2, width * height * sizeof(float)));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuMemAlloc(&s->sum2, width * height * sizeof(float)));
+    if (ret < 0)
+        return ret;
+
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+
+    s->hw_frames_ctx = ctx->inputs[0]->hw_frames_ctx;
+
+    ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->hw_frames_ctx);
+    if (!ctx->outputs[0]->hw_frames_ctx)
+        return AVERROR(ENOMEM);
+
+
+    if (!format_is_supported(hw_frames_ctx->sw_format)) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n", av_get_pix_fmt_name(hw_frames_ctx->sw_format));
+        return AVERROR(ENOSYS);
+    }
+
+    return 0;
+}
+
+static int query_formats(AVFilterContext *ctx)
+{
+    static const enum AVPixelFormat pix_fmts[] = {
+        AV_PIX_FMT_CUDA,
+        AV_PIX_FMT_NONE
+    };
+    AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
+    if (!fmts_list)
+        return AVERROR(ENOMEM);
+    return ff_set_common_formats(ctx, fmts_list);
+}
+
+AVFILTER_DEFINE_CLASS(nlmeans_cuda);
+
+static const AVFilterPad nlmeans_cuda_inputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = config_props,
+        .filter_frame = filter_frame,
+    }
+};
+
+static const AVFilterPad nlmeans_cuda_outputs[] = {
+    {
+        .name          = "default",
+        .type          = AVMEDIA_TYPE_VIDEO,
+    }
+};
+
+const AVFilter ff_vf_nlmeans_cuda = {
+    .name          = "nlmeans_cuda",
+    .description   = NULL_IF_CONFIG_SMALL("Non-local means denoiser through CUDA"),
+    .priv_size     = sizeof(NLMeansCudaContext),
+    .init          = init,
+    .uninit        = uninit,
+    .query_formats = query_formats,
+    FILTER_INPUTS(nlmeans_cuda_inputs),
+    FILTER_OUTPUTS(nlmeans_cuda_outputs),
+    .priv_class    = &nlmeans_cuda_class,
+    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};
diff --git a/libavfilter/vf_nlmeans_cuda.cu b/libavfilter/vf_nlmeans_cuda.cu
new file mode 100644
index 0000000000..b84f7cf78c
--- /dev/null
+++ b/libavfilter/vf_nlmeans_cuda.cu
@@ -0,0 +1,378 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "cuda/vector_helpers.cuh"
+
+
+extern "C" {
+
+__global__ void nlmeans_average_uchar(
+    cudaTextureObject_t tex,
+    uchar *dst,
+    int width, int dst_pitch,
+    float* sum, float* weight)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    float w = weight[y * width + x];
+    float s = sum[y * width + x];
+
+    uchar src_pix_uchar = tex2D<uchar>(tex, x, y);
+    int src_pix_int = src_pix_uchar;
+    float src_pix = (float)src_pix_int;
+
+    float resultf = (s + src_pix) / (1 + w);
+
+    int result = (int)resultf;
+
+    dst[y*dst_pitch+x] = result;
+}
+
+
+__global__ void nlmeans_average_uchar2(
+    cudaTextureObject_t tex,
+    uchar2 *dst,
+    int width, int dst_pitch,
+    float* sum, float* sum2, float* weight, float* weight2)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    float w = weight[y * width + x];
+    float s = sum[y * width + x];
+
+    float w2 = weight2[y * width + x];
+    float s2 = sum2[y * width + x];
+
+    uchar2 src_pix_uchar = tex2D<uchar2>(tex, x, y);
+
+    int src_pix_int = src_pix_uchar.x;
+    float src_pix = (float)src_pix_int;
+
+    int src_pix_int2 = src_pix_uchar.y;
+    float src_pix2 = (float)src_pix_int2;
+
+    float resultf = (s + src_pix) / (1 + w);
+    float result2f = (s2 + src_pix2) / (1 + w2);
+
+    int result = (int)resultf;
+    int result2 = (int)result2f;
+
+    uchar2 output;
+    output.x = result;
+    output.y = result2;
+
+    dst[y*dst_pitch+x] = output;
+}
+
+
+__global__ void nlmeans_weight_uchar2(
+    float* sum, float* sum2,
+    float* weight, float* weight2,
+    int4 *integral_img,
+    int4 *integral_img2,
+    cudaTextureObject_t tex,
+    int width, int height,
+    int p, float h,
+    int4 dx, int4 dy)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    int4 xoff = make_int4(x, x, x, x) + dx;
+    int4 yoff = make_int4(y, y, y, y) + dy;
+
+    int4 a = make_int4(0, 0, 0, 0);
+    int4 b = make_int4(0, 0, 0, 0);
+    int4 c = make_int4(0, 0, 0, 0);
+    int4 d = make_int4(0, 0, 0, 0);
+
+    int4 a2 = make_int4(0, 0, 0, 0);
+    int4 b2 = make_int4(0, 0, 0, 0);
+    int4 c2 = make_int4(0, 0, 0, 0);
+    int4 d2 = make_int4(0, 0, 0, 0);
+
+    int4 src_pix  = make_int4(0, 0, 0, 0);
+    int4 src_pix2 = make_int4(0, 0, 0, 0);
+
+    int oobb = (x - p) < 0 || (y - p) < 0 || (y + p) >= height || (x + p) >= width;
+
+    uchar2 ax, ay, az, aw;
+    ax = tex2D<uchar2>(tex, xoff.x, yoff.x);
+    ay = tex2D<uchar2>(tex, xoff.y, yoff.y);
+    az = tex2D<uchar2>(tex, xoff.z, yoff.z);
+    aw = tex2D<uchar2>(tex, xoff.w, yoff.w);
+
+    src_pix.x = ax.x;
+    src_pix.y = ay.x;
+    src_pix.z = az.x;
+    src_pix.w = aw.x;
+
+    src_pix2.x = ax.y;
+    src_pix2.y = ay.y;
+    src_pix2.z = az.y;
+    src_pix2.w = aw.y;
+
+    if (!oobb) {
+        a = integral_img[(y - p) * width + x - p];
+        b = integral_img[(y + p) * width + x - p];
+        c = integral_img[(y - p) * width + x + p];
+        d = integral_img[(y + p) * width + x + p];
+
+        a2 = integral_img2[(y - p) * width + x - p];
+        b2 = integral_img2[(y + p) * width + x - p];
+        c2 = integral_img2[(y - p) * width + x + p];
+        d2 = integral_img2[(y + p) * width + x + p];
+    }
+
+    int4 patch_diff_int = d + a - c - b;
+    float4 patch_diff;
+    patch_diff.x = (float) (-patch_diff_int.x);
+    patch_diff.y = (float) (-patch_diff_int.y);
+    patch_diff.z = (float) (-patch_diff_int.z);
+    patch_diff.w = (float) (-patch_diff_int.w);
+
+    int4 patch_diff_int2 = d2 + a2 - c2 - b2;
+    float4 patch_diff2;
+    patch_diff2.x = (float) (-patch_diff_int2.x);
+    patch_diff2.y = (float) (-patch_diff_int2.y);
+    patch_diff2.z = (float) (-patch_diff_int2.z);
+    patch_diff2.w = (float) (-patch_diff_int2.w);
+
+
+    float4 w;
+    w.x = __expf(patch_diff.x / (h*h));
+    w.y = __expf(patch_diff.y / (h*h));
+    w.z = __expf(patch_diff.z / (h*h));
+    w.w = __expf(patch_diff.w / (h*h));
+
+    float4 w2;
+    w2.x = __expf(patch_diff2.x / (h*h));
+    w2.y = __expf(patch_diff2.y / (h*h));
+    w2.z = __expf(patch_diff2.z / (h*h));
+    w2.w = __expf(patch_diff2.w / (h*h));
+
+    float w_sum = w.x + w.y + w.z + w.w;
+    weight[y * width + x] += w_sum;
+
+    float w_sum2 = w2.x + w2.y + w2.z + w2.w;
+    weight2[y * width + x] += w_sum2;
+
+    float4 src_pixf;
+    src_pixf.x = (float)src_pix.x;
+    src_pixf.y = (float)src_pix.y;
+    src_pixf.z = (float)src_pix.z;
+    src_pixf.w = (float)src_pix.w;
+
+    float4 src_pix2f;
+    src_pix2f.x = (float)src_pix2.x;
+    src_pix2f.y = (float)src_pix2.y;
+    src_pix2f.z = (float)src_pix2.z;
+    src_pix2f.w = (float)src_pix2.w;
+
+    sum[y * width + x] += w.x * src_pixf.x + w.y * src_pixf.y + w.z * src_pixf.z + w.w * src_pixf.w;
+    sum2[y * width + x] += w2.x * src_pix2f.x + w2.y * src_pix2f.y + w2.z * src_pix2f.z + w2.w * src_pix2f.w;
+}
+
+
+__global__ void nlmeans_weight_uchar(
+    float* sum, float* weight,
+    int4 *integral_img,
+    cudaTextureObject_t tex,
+    int width, int height,
+    int p, float h,
+    int4 dx, int4 dy)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    int4 xoff = make_int4(x, x, x, x) + dx;
+    int4 yoff = make_int4(y, y, y, y) + dy;
+
+    int4 a = make_int4(0, 0, 0, 0);
+    int4 b = make_int4(0, 0, 0, 0);
+    int4 c = make_int4(0, 0, 0, 0);
+    int4 d = make_int4(0, 0, 0, 0);
+
+    int4 src_pix = make_int4(0, 0, 0, 0);
+
+    int oobb = (x - p) < 0 || (y - p) < 0 || (y + p) >= height || (x + p) >= width;
+
+    src_pix.x = tex2D<uchar>(tex, xoff.x, yoff.x);
+    src_pix.y = tex2D<uchar>(tex, xoff.y, yoff.y);
+    src_pix.z = tex2D<uchar>(tex, xoff.z, yoff.z);
+    src_pix.w = tex2D<uchar>(tex, xoff.w, yoff.w);
+
+    if (!oobb) {
+        a = integral_img[(y - p) * width + x - p];
+        b = integral_img[(y + p) * width + x - p];
+        c = integral_img[(y - p) * width + x + p];
+        d = integral_img[(y + p) * width + x + p];
+    }
+
+    int4 patch_diff_int = d + a - c - b;
+    float4 patch_diff;
+    patch_diff.x = (float) (-patch_diff_int.x);
+    patch_diff.y = (float) (-patch_diff_int.y);
+    patch_diff.z = (float) (-patch_diff_int.z);
+    patch_diff.w = (float) (-patch_diff_int.w);
+
+
+    float4 w;
+    w.x = __expf(patch_diff.x / (h*h));
+    w.y = __expf(patch_diff.y / (h*h));
+    w.z = __expf(patch_diff.z / (h*h));
+    w.w = __expf(patch_diff.w / (h*h));
+
+    float w_sum = w.x + w.y + w.z + w.w;
+    weight[y * width + x] += w_sum;
+
+    float4 src_pixf;
+    src_pixf.x = (float)src_pix.x;
+    src_pixf.y = (float)src_pix.y;
+    src_pixf.z = (float)src_pix.z;
+    src_pixf.w = (float)src_pix.w;
+
+    sum[y * width + x] += w.x * src_pixf.x + w.y * src_pixf.y + w.z * src_pixf.z + w.w * src_pixf.w;
+}
+
+
+__global__ void nlmeans_vert_uchar(
+    int4 *integral_img,
+    int width, int height)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+    int4 sum = make_int4(0, 0, 0, 0);
+
+    for (int i = 0; i < height; i++) {
+        integral_img[i * width + x] += sum;
+        sum = integral_img[i * width + x];
+    }
+}
+
+
+__global__ void nlmeans_vert_uchar2(
+    int4 *integral_img,
+    int4 *integral_img2,
+    int width, int height)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+    int4 sum = make_int4(0, 0, 0, 0);
+    int4 sum2 = make_int4(0, 0, 0, 0);
+
+    for (int i = 0; i < height; i++) {
+        integral_img[i * width + x] += sum;
+        sum = integral_img[i * width + x];
+
+        integral_img2[i * width + x] += sum2;
+        sum2 = integral_img2[i * width + x];
+    }
+}
+
+
+__global__ void nlmeans_horiz_uchar(
+    int4 *integral_img,
+    cudaTextureObject_t tex,
+    int width, int height,
+    int4 dx, int4 dy)
+{
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    int4 sum = make_int4(0, 0, 0, 0);
+    uchar s2x, s2y, s2z, s2w;
+
+    for (int i = 0; i < width; i++) {
+        uchar s1c = tex2D<uchar>(tex, i, y);
+        int4 s1 = make_int4(s1c, s1c, s1c, s1c);
+
+        s2x = tex2D<uchar>(tex, i + dx.x, y + dy.x);
+        s2y = tex2D<uchar>(tex, i + dx.y, y + dy.y);
+        s2z = tex2D<uchar>(tex, i + dx.z, y + dy.z);
+        s2w = tex2D<uchar>(tex, i + dx.w, y + dy.w);
+
+        int4 s2 = make_int4(s2x, s2y, s2z, s2w);
+
+        //sum += (s1 - s2) * (s1 - s2);
+        int4 s3 = s1 - s2;
+        int4 s4;
+        s4.x = s3.x * s3.x;
+        s4.y = s3.y * s3.y;
+        s4.z = s3.z * s3.z;
+        s4.w = s3.w * s3.w;
+
+        sum += s4;
+
+        integral_img[y * width + i] = sum;
+    }
+}
+
+
+__global__ void nlmeans_horiz_uchar2(
+    int4 *integral_img,
+    int4 *integral_img2,
+    cudaTextureObject_t tex,
+    int width, int height,
+    int4 dx, int4 dy)
+{
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    int4 sum = make_int4(0, 0, 0, 0);
+    int4 sum2 = make_int4(0, 0, 0, 0);
+    uchar2 s2x, s2y, s2z, s2w;
+
+    for (int i = 0; i < width; i++) {
+        uchar2 s1c = tex2D<uchar2>(tex, i, y);
+        int4 s1 = make_int4(s1c.x, s1c.x, s1c.x, s1c.x);
+        int4 r1 = make_int4(s1c.y, s1c.y, s1c.y, s1c.y);
+
+        s2x = tex2D<uchar2>(tex, i + dx.x, y + dy.x);
+        s2y = tex2D<uchar2>(tex, i + dx.y, y + dy.y);
+        s2z = tex2D<uchar2>(tex, i + dx.z, y + dy.z);
+        s2w = tex2D<uchar2>(tex, i + dx.w, y + dy.w);
+
+        int4 s2 = make_int4(s2x.x, s2y.x, s2z.x, s2w.x);
+        int4 r2 = make_int4(s2x.y, s2y.y, s2z.y, s2w.y);
+
+        //sum += (s1 - s2) * (s1 - s2);
+        int4 s3 = s1 - s2;
+        int4 s4;
+        s4.x = s3.x * s3.x;
+        s4.y = s3.y * s3.y;
+        s4.z = s3.z * s3.z;
+        s4.w = s3.w * s3.w;
+
+        sum += s4;
+
+        //sum2 += (r1 - r2) * (r1 - r2);
+        int4 r3 = r1 - r2;
+        int4 r4;
+        r4.x = r3.x * r3.x;
+        r4.y = r3.y * r3.y;
+        r4.z = r3.z * r3.z;
+        r4.w = r3.w * r3.w;
+
+        sum2 += r4;
+
+        integral_img[y * width + i] = sum;
+        integral_img2[y * width + i] = sum2;
+    }
+}
+
+}