From patchwork Tue Jul 27 19:42:08 2021
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Dylan Fernando <dylanf123@gmail.com>
X-Patchwork-Id: 29060
Delivered-To: ffmpegpatchwork2@gmail.com
Received: by 2002:a5d:965a:0:0:0:0:0 with SMTP id d26csp4516164ios;
        Tue, 27 Jul 2021 02:42:57 -0700 (PDT)
X-Google-Smtp-Source: 
 ABdhPJwLScdCAMElVYHDegqvluBqJDIgfJLucnMdn+qvmGtz/feir+M3oFVVLokPcl9ZFtKtLTzW
X-Received: by 2002:a05:6402:b79:: with SMTP id
 cb25mr26562810edb.164.1627378977118;
        Tue, 27 Jul 2021 02:42:57 -0700 (PDT)
ARC-Seal: i=1; a=rsa-sha256; t=1627378977; cv=none;
        d=google.com; s=arc-20160816;
        b=a1BkFG423t1PYhvJRDiKEZI5wQOlREbN++kTFw8pqYseXIz4uQf8aV7rKpyTIvYvsz
         xD3O841jU8ORKGPZfx0aDx5GGgE6P3iG52AaJjxXVxnH3rqXKqdH/Hf5K4y+Z0qW8rHO
         MUqKesleB/W93u8/D2JTkYepmvVB0mVO9PnqrITuZeNV7Ir/TFh0Wm0seHcqKPjKIdbL
         sT8RYbIt7WbFNNns/EpyLG2hJPFAI5VRDHjEEUfqj0KjUdtx02jl6NG6JnLsvYDZaoju
         jLjeA31OaIjkV6a3+sYVN7Js3lA8ClFo2BCt+U6Ri4Z4jsvloraG5E/CAPMDs4V4kAEV
         q7Qg==
ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com;
 s=arc-20160816;
        h=sender:errors-to:reply-to:list-subscribe:list-help:list-post
         :list-archive:list-unsubscribe:list-id:precedence:subject:to
         :message-id:date:from:mime-version:dkim-signature:delivered-to;
        bh=giVUEhy68m91y+fuWqd57P0NpLv7PmnqCHzx46JhGR8=;
        b=00j41H3Kp8PotMLtwhYIgYvMSnywqTz0bzzqqR2ZpqsMm+9S8fHNRNkAoHpmEyTE5a
         u0FlmWH1q+fECWGsYYku6GnbNK/uc8K6JkVcFuX8NlzVnRjbWrO/j4EiNDjI7B7D8plB
         lVP1WcAxI6PzTv0O0+yUkEfKZUir0asVCL39SQAjX9IE0QJw64Wscz9chJ6hJgIGAnDf
         AAbQ8BAlW/m8GrwnGChJZd2WPVUQwkamA4ONv1+72hs5IcPbH7wy9IIfNQ1ms6UIX0St
         EtMluhmFNFWr6zBGYK6vUFOBYJVaeUPCx/BvJ67etL00aEQ+psPgvOzhnrJE3yax/IVa
         AoFQ==
ARC-Authentication-Results: i=1; mx.google.com;
       dkim=neutral (body hash did not verify) header.i=@gmail.com
 header.s=20161025 header.b=pe0A2GQb;
       spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org
 designates 79.124.17.100 as permitted sender)
 smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org;
       dmarc=fail (p=NONE sp=QUARANTINE dis=NONE) header.from=gmail.com
Return-Path: <ffmpeg-devel-bounces@ffmpeg.org>
Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100])
        by mx.google.com with ESMTP id
 hs24si2554788ejc.287.2021.07.27.02.42.56;
        Tue, 27 Jul 2021 02:42:57 -0700 (PDT)
Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org
 designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100;
Authentication-Results: mx.google.com;
       dkim=neutral (body hash did not verify) header.i=@gmail.com
 header.s=20161025 header.b=pe0A2GQb;
       spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org
 designates 79.124.17.100 as permitted sender)
 smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org;
       dmarc=fail (p=NONE sp=QUARANTINE dis=NONE) header.from=gmail.com
Received: from [127.0.1.1] (localhost [127.0.0.1])
	by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 5028F6806C7;
	Tue, 27 Jul 2021 12:42:52 +0300 (EEST)
X-Original-To: ffmpeg-devel@ffmpeg.org
Delivered-To: ffmpeg-devel@ffmpeg.org
Received: from mail-pl1-f179.google.com (mail-pl1-f179.google.com
 [209.85.214.179])
 by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 611E2680A0D
 for <ffmpeg-devel@ffmpeg.org>; Tue, 27 Jul 2021 12:42:46 +0300 (EEST)
Received: by mail-pl1-f179.google.com with SMTP id c16so9545978plh.7
 for <ffmpeg-devel@ffmpeg.org>; Tue, 27 Jul 2021 02:42:46 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20161025;
 h=mime-version:from:date:message-id:subject:to;
 bh=r+XI8bOYVTO12XM7pT/MbjAdf1rpIBEljaOcUuRXngo=;
 b=pe0A2GQbxeQ2jUuotTszUglztojjIajOXfg/+VIgMgFlOjscqCt4p5qi60wZK69Ubq
 7fw79p2VfcmAe9FQ5yDiUWqHiq7Mm0yGFOZtsSzLWZ6c7wEOA/dqJxQx/DuqaWt5EyzT
 S8iF/MneK1wBkfZ0wit4lCiN1CnsIuyRQ8UzIOogZ39QP3tptuNeUjVmkgwrSYqFpbo/
 xqSvbHxkYWgLYskstOsxIfpBR0DRCdigrqKT/3OlhfHx15xIRgBtETJogagfjfx1kegS
 JSVi/Ph8MzcSkKa+RtKm8ghYY60JqsOXQdtkHHoSuIi5g/wJTUHPzwtQ34EL3yBXXwCU
 aAeA==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
 d=1e100.net; s=20161025;
 h=x-gm-message-state:mime-version:from:date:message-id:subject:to;
 bh=r+XI8bOYVTO12XM7pT/MbjAdf1rpIBEljaOcUuRXngo=;
 b=Aszyfq+kc3AMiYZ82Cc9TFs+qQfh2mv56KAfG4b+wtyb0TYWZ4/PLl6KNPFfIDvlFw
 FfoGavxy2jRxfurkESvG8PMGOBOwIU4IpX13WWXy8CVen5R0ivm0GteahNQB4KOu6eiF
 Myr0dNwYPADQrNOnQFTWNq2PNUpu/BY9Gy+5mPANHlFtKrky+qHZT+fJM+1l8U+P3HCL
 2sXajsBS51VYK/1Lpj27Db2y/jsAbt5M1PYFwU8PQGCBD0Wm7W0NdcNT38Oarlv7gBO3
 1pRHM4K4mqvMvMUun5Bxtep2cFFHbxnF7kWCklz3Xsn3P7KlEAgyfmJapqMSRr0YxpM3
 5NWA==
X-Gm-Message-State: AOAM531wkWd5PCiBZnWzg35RsEOCEZDOmhYuZulakg70Lvjgr3t/j3wi
 EmF5n6td34NFVft9+JTelmKE78ROrXtI1J0K8O4HH5pz8v4=
X-Received: by 2002:a05:6a00:1693:b029:333:da3a:8c86 with SMTP id
 k19-20020a056a001693b0290333da3a8c86mr22893437pfc.41.1627378964068; Tue, 27
 Jul 2021 02:42:44 -0700 (PDT)
MIME-Version: 1.0
From: Dylan Fernando <dylanf123@gmail.com>
Date: Tue, 27 Jul 2021 19:42:08 +0000
Message-ID: 
 <CAPa-kAziQwCDLWABje-j9NTE1=OgzTdzPH2c1yKuCr6VXpJ5Kg@mail.gmail.com>
To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
X-Content-Filtered-By: Mailman/MimeDel 2.1.29
Subject: [FFmpeg-devel] [PATCH v1] lavfi: add nlmeans CUDA filter
X-BeenThere: ffmpeg-devel@ffmpeg.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: FFmpeg development discussions and patches <ffmpeg-devel.ffmpeg.org>
List-Unsubscribe: <https://ffmpeg.org/mailman/options/ffmpeg-devel>,
 <mailto:ffmpeg-devel-request@ffmpeg.org?subject=unsubscribe>
List-Archive: <https://ffmpeg.org/pipermail/ffmpeg-devel>
List-Post: <mailto:ffmpeg-devel@ffmpeg.org>
List-Help: <mailto:ffmpeg-devel-request@ffmpeg.org?subject=help>
List-Subscribe: <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel>,
 <mailto:ffmpeg-devel-request@ffmpeg.org?subject=subscribe>
Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Errors-To: ffmpeg-devel-bounces@ffmpeg.org
Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>
X-TUID: l35agCu/rRuX

Subject: [PATCH] lavfi: add nlmeans_cuda filter
---
 compat/cuda/cuda_runtime.h     |   1 +
 configure                      |   2 +
 doc/filters.texi               |   4 +
 libavfilter/Makefile           |   2 +
 libavfilter/allfilters.c       |   1 +
 libavfilter/vf_nlmeans_cuda.c  | 814 +++++++++++++++++++++++++++++++++
 libavfilter/vf_nlmeans_cuda.cu | 361 +++++++++++++++
 7 files changed, 1185 insertions(+)
 create mode 100644 libavfilter/vf_nlmeans_cuda.c
 create mode 100644 libavfilter/vf_nlmeans_cuda.cu

diff --git a/compat/cuda/cuda_runtime.h b/compat/cuda/cuda_runtime.h
index c5450b2542..c1e2143dde 100644
--- a/compat/cuda/cuda_runtime.h
+++ b/compat/cuda/cuda_runtime.h
@@ -184,5 +184,6 @@ static inline __device__ double fabs(double a) { return __builtin_fabs(a); }
 
 static inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); }
 static inline __device__ float __cosf(float a) { return __nvvm_cos_approx_f(a); }
+static inline __device__ float exp(float a) { return __nvvm_ex2_approx_f(a); }
 
 #endif /* COMPAT_CUDA_CUDA_RUNTIME_H */
diff --git a/configure b/configure
index 646d16e3c9..96a6fcde7d 100755
--- a/configure
+++ b/configure
@@ -3094,6 +3094,8 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
 overlay_cuda_filter_deps="ffnvcodec"
 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+nlmeans_cuda_filter_deps="ffnvcodec"
+nlmeans_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 
 amf_deps_any="libdl LoadLibrary"
 nvenc_deps="ffnvcodec"
diff --git a/doc/filters.texi b/doc/filters.texi
index 66c0f87e47..a0b68fc49f 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -15228,6 +15228,10 @@ Same as @option{r} but for chroma planes.
 The default value is @var{0} and means automatic.
 @end table
 
+@section nlmeans_cuda
+
+Non-local Means denoise filter through CUDA, this filter accepts same options as @ref{nlmeans}.
+
 @section nnedi
 
 Deinterlace video using neural network edge directed interpolation.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 49c0c8342b..565923d85a 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -341,6 +341,8 @@ OBJS-$(CONFIG_MPDECIMATE_FILTER)             += vf_mpdecimate.o
 OBJS-$(CONFIG_NEGATE_FILTER)                 += vf_lut.o
 OBJS-$(CONFIG_NLMEANS_FILTER)                += vf_nlmeans.o
 OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER)         += vf_nlmeans_opencl.o opencl.o opencl/nlmeans.o
+OBJS-$(CONFIG_NLMEANS_CUDA_FILTER)           += vf_nlmeans_cuda.o vf_nlmeans_cuda.ptx.o \
+                                                cuda/load_helper.o
 OBJS-$(CONFIG_NNEDI_FILTER)                  += vf_nnedi.o
 OBJS-$(CONFIG_NOFORMAT_FILTER)               += vf_format.o
 OBJS-$(CONFIG_NOISE_FILTER)                  += vf_noise.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index ae74f9c891..5fcdfecfbc 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -327,6 +327,7 @@ extern const AVFilter ff_vf_msad;
 extern const AVFilter ff_vf_negate;
 extern const AVFilter ff_vf_nlmeans;
 extern const AVFilter ff_vf_nlmeans_opencl;
+extern const AVFilter ff_vf_nlmeans_cuda;
 extern const AVFilter ff_vf_nnedi;
 extern const AVFilter ff_vf_noformat;
 extern const AVFilter ff_vf_noise;
diff --git a/libavfilter/vf_nlmeans_cuda.c b/libavfilter/vf_nlmeans_cuda.c
new file mode 100644
index 0000000000..fd7e649556
--- /dev/null
+++ b/libavfilter/vf_nlmeans_cuda.c
@@ -0,0 +1,814 @@
+
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "avfilter.h"
+#include "internal.h"
+
+#include "cuda/load_helper.h"
+
+static const enum AVPixelFormat supported_formats[] = {
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NV12,
+    AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_P010,
+    AV_PIX_FMT_P016,
+    AV_PIX_FMT_YUV444P16,
+    AV_PIX_FMT_0RGB32,
+    AV_PIX_FMT_0BGR32,
+};
+
+
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
+
+#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
+#define BLOCKX 32
+#define BLOCKY 16
+
+
+
+typedef struct NLMeansCudaContext {
+    const AVClass *class;
+
+    double                sigma;
+    int                   patch_size;
+    int                   patch_size_uv;
+    int                   research_size;
+    int                   research_size_uv;
+    int                   initialised;
+
+    float                 h;
+
+    AVBufferRef *hw_frames_ctx;
+    AVCUDADeviceContext *hwctx;
+
+    CUmodule    cu_module;
+
+    CUfunction  cu_func_horiz_uchar;
+    CUfunction  cu_func_horiz_uchar2;
+    CUfunction  cu_func_vert_uchar;
+    CUfunction  cu_func_vert_uchar2;
+    CUfunction  cu_func_weight_uchar;
+    CUfunction  cu_func_weight_uchar2;
+    CUfunction  cu_func_average_uchar;
+    CUfunction  cu_func_average_uchar2;
+    CUstream    cu_stream;
+
+
+    CUdeviceptr integral_img;
+    CUdeviceptr integral_img2;
+    CUdeviceptr weight;
+    CUdeviceptr weight2;
+    CUdeviceptr sum;
+    CUdeviceptr sum2;
+
+} NLMeansCudaContext;
+
+#define OFFSET(x) offsetof(NLMeansCudaContext, x)
+#define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
+
+static const AVOption nlmeans_cuda_options[] = {
+    { "s",  "denoising strength", OFFSET(sigma), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 30.0, FLAGS },
+    { "p",  "patch size",                   OFFSET(patch_size),    AV_OPT_TYPE_INT, { .i64 = 2*3+1 }, 0, 99, FLAGS },
+    { "pc", "patch size for chroma planes", OFFSET(patch_size_uv), AV_OPT_TYPE_INT, { .i64 = 0 },     0, 99, FLAGS },
+    { "r",  "research window",                   OFFSET(research_size),    AV_OPT_TYPE_INT, { .i64 = 7*2+1 }, 0, 99, FLAGS },
+    { "rc", "research window for chroma planes", OFFSET(research_size_uv), AV_OPT_TYPE_INT, { .i64 = 0 },     0, 99, FLAGS },
+    { NULL }
+};
+
+
+
+static av_cold int init(AVFilterContext *avctx)
+{
+    NLMeansCudaContext *ctx = avctx->priv;
+
+    ctx->h = ctx->sigma * 10;
+    if (!(ctx->research_size & 1)) {
+        ctx->research_size |= 1;
+        av_log(avctx, AV_LOG_WARNING,
+               "research_size should be odd, set to %d",
+               ctx->research_size);
+    }
+
+    if (!(ctx->patch_size & 1)) {
+        ctx->patch_size |= 1;
+        av_log(avctx, AV_LOG_WARNING,
+               "patch_size should be odd, set to %d",
+               ctx->patch_size);
+    }
+
+    if (!ctx->research_size_uv)
+        ctx->research_size_uv = ctx->research_size;
+    if (!ctx->patch_size_uv)
+        ctx->patch_size_uv = ctx->patch_size;
+
+
+    ctx->initialised = 1;
+
+    return 0;
+}
+
+
+static int format_is_supported(enum AVPixelFormat fmt)
+{
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
+        if (supported_formats[i] == fmt)
+            return 1;
+    return 0;
+}
+
+
+static int call_average2(AVFilterContext *ctx, CUfunction func, int channels,
+                         uint8_t *src_dptr, int src_width, int src_height, int src_pitch, 
+                         float *sum, float *sum2, float *weight, float *weight2,
+                         uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch,
+                         int pixel_size)
+{
+    NLMeansCudaContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+    CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr;
+    CUtexObject tex = 0;
+    void *args_uchar[] = { &tex, &dst_devptr, &src_width, &dst_pitch, &sum, &sum2, &weight, &weight2};
+
+    int ret;
+
+    CUDA_TEXTURE_DESC tex_desc = {
+        .filterMode = CU_TR_FILTER_MODE_POINT,
+        .flags = CU_TRSF_READ_AS_INTEGER,
+    };
+
+    CUDA_RESOURCE_DESC res_desc = {
+        .resType = CU_RESOURCE_TYPE_PITCH2D,
+        .res.pitch2D.format = pixel_size == 1 ?
+                              CU_AD_FORMAT_UNSIGNED_INT8 :
+                              CU_AD_FORMAT_UNSIGNED_INT16,
+        .res.pitch2D.numChannels = channels,
+        .res.pitch2D.width = src_width,
+        .res.pitch2D.height = src_height,
+        .res.pitch2D.pitchInBytes = src_pitch,
+        .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
+    };
+
+    dst_pitch /= channels * pixel_size;
+
+    ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
+    if (ret < 0)
+        goto exit;
+
+    ret = CHECK_CU(cu->cuLaunchKernel(func,
+                                      DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1,
+                                      BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL));
+
+exit:
+    if (tex)
+        CHECK_CU(cu->cuTexObjectDestroy(tex));
+
+    return ret;
+}
+
+
+static int call_average(AVFilterContext *ctx, CUfunction func, int channels,
+                        uint8_t *src_dptr, int src_width, int src_height, int src_pitch, float *sum, float *weight,
+                        uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch,
+                        int pixel_size)
+{
+    NLMeansCudaContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+    CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr;
+    CUtexObject tex = 0;
+    void *args_uchar[] = { &tex, &dst_devptr, &src_width, &dst_pitch, &sum, &weight};
+
+    int ret;
+
+    CUDA_TEXTURE_DESC tex_desc = {
+        .filterMode = CU_TR_FILTER_MODE_POINT,
+        .flags = CU_TRSF_READ_AS_INTEGER,
+    };
+
+    CUDA_RESOURCE_DESC res_desc = {
+        .resType = CU_RESOURCE_TYPE_PITCH2D,
+        .res.pitch2D.format = pixel_size == 1 ?
+                              CU_AD_FORMAT_UNSIGNED_INT8 :
+                              CU_AD_FORMAT_UNSIGNED_INT16,
+        .res.pitch2D.numChannels = channels,
+        .res.pitch2D.width = src_width,
+        .res.pitch2D.height = src_height,
+        .res.pitch2D.pitchInBytes = src_pitch,
+        .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
+    };
+
+    dst_pitch /= channels * pixel_size;
+
+    ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
+    if (ret < 0)
+        goto exit;
+
+    ret = CHECK_CU(cu->cuLaunchKernel(func,
+                                      DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1,
+                                      BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL));
+
+exit:
+    if (tex)
+        CHECK_CU(cu->cuTexObjectDestroy(tex));
+
+    return ret;
+}
+
+static int call_weight2(AVFilterContext *ctx, int channels,
+                        uint8_t *src_dptr, int src_width, int src_height, int src_pitch, int *integ_img, int *integ_img2, 
+                        float *sum, float *sum2, float *weight, float *weight2, int p,
+                        int *dx_cur, int *dy_cur,
+                        int pixel_size)
+{
+    NLMeansCudaContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+    CUtexObject tex = 0;
+    int ret;
+
+
+    void *args_uchar[] = { &sum, &sum2, &weight, &weight2, &integ_img, &integ_img2, &tex, &src_width, &src_height, &p, &s->h, dx_cur, dy_cur };
+
+
+    CUDA_TEXTURE_DESC tex_desc = {
+        .filterMode = CU_TR_FILTER_MODE_POINT,
+        .flags = CU_TRSF_READ_AS_INTEGER,
+    };
+
+    CUDA_RESOURCE_DESC res_desc = {
+        .resType = CU_RESOURCE_TYPE_PITCH2D,
+        .res.pitch2D.format = pixel_size == 1 ?
+                              CU_AD_FORMAT_UNSIGNED_INT8 :
+                              CU_AD_FORMAT_UNSIGNED_INT16,
+        .res.pitch2D.numChannels = channels,
+        .res.pitch2D.width = src_width,
+        .res.pitch2D.height = src_height,
+        .res.pitch2D.pitchInBytes = src_pitch,
+        .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
+    };
+
+    src_pitch /= channels * pixel_size;
+
+    ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
+    if (ret < 0)
+        goto exit;
+
+    ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_weight_uchar2,
+                                      DIV_UP(src_width, BLOCKX), DIV_UP(src_height, BLOCKY), 1,
+                                      BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL));
+
+exit:
+    if (tex)
+        CHECK_CU(cu->cuTexObjectDestroy(tex));
+
+    return ret;
+}
+
+
+
+static int call_weight(AVFilterContext *ctx, int channels,
+                       uint8_t *src_dptr, int src_width, int src_height, int src_pitch, int *integ_img, float *sum, float *weight, int p,
+                       int *dx_cur, int *dy_cur,
+                       int pixel_size)
+{
+    NLMeansCudaContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+    CUtexObject tex = 0;
+    int ret;
+
+
+    void *args_uchar[] = { &sum, &weight, &integ_img, &tex, &src_width, &src_height, &p, &s->h, dx_cur, dy_cur };
+
+
+    CUDA_TEXTURE_DESC tex_desc = {
+        .filterMode = CU_TR_FILTER_MODE_POINT,
+        .flags = CU_TRSF_READ_AS_INTEGER,
+    };
+
+    CUDA_RESOURCE_DESC res_desc = {
+        .resType = CU_RESOURCE_TYPE_PITCH2D,
+        .res.pitch2D.format = pixel_size == 1 ?
+                              CU_AD_FORMAT_UNSIGNED_INT8 :
+                              CU_AD_FORMAT_UNSIGNED_INT16,
+        .res.pitch2D.numChannels = channels,
+        .res.pitch2D.width = src_width,
+        .res.pitch2D.height = src_height,
+        .res.pitch2D.pitchInBytes = src_pitch,
+        .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
+    };
+
+    src_pitch /= channels * pixel_size;
+
+    ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
+    if (ret < 0)
+        goto exit;
+
+    ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_weight_uchar,
+                                      DIV_UP(src_width, BLOCKX), DIV_UP(src_height, BLOCKY), 1,
+                                      BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL));
+
+exit:
+    if (tex)
+        CHECK_CU(cu->cuTexObjectDestroy(tex));
+
+    return ret;
+}
+
+
+static int call_vert2(AVFilterContext *ctx, int channels,
+                      int src_width, int src_height, int *integ_img, int *integ_img2,
+                      int pixel_size)
+{
+    NLMeansCudaContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+    int ret;
+
+    void *args_uchar[] = { &integ_img, &integ_img2, &src_width, &src_height };
+
+
+    ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_vert_uchar2,
+                                      DIV_UP(src_width, BLOCKX), 1, 1,
+                                      BLOCKX, 1, 1, 0, s->cu_stream, args_uchar, NULL));
+
+
+    return ret;
+}
+
+
+static int call_vert(AVFilterContext *ctx, int channels,
+                     int src_width, int src_height, int *integ_img,
+                     int pixel_size)
+{
+    NLMeansCudaContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+    int ret;
+
+    void *args_uchar[] = { &integ_img, &src_width, &src_height };
+
+
+    ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_vert_uchar,
+                                      DIV_UP(src_width, BLOCKX), 1, 1,
+                                      BLOCKX, 1, 1, 0, s->cu_stream, args_uchar, NULL));
+
+
+
+    return ret;
+}
+
+
+static int call_horiz2(AVFilterContext *ctx, int channels,
+                       uint8_t *src_dptr, int src_width, int src_height, int src_pitch, int *integ_img, int *integ_img2,
+                       int *dx_cur, int *dy_cur,
+                       int pixel_size)
+{
+    NLMeansCudaContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+    CUtexObject tex = 0;
+    int ret;
+
+
+    void *args_uchar[] = { &integ_img, &integ_img2, &tex, &src_width, &src_height, dx_cur, dy_cur };
+
+
+    CUDA_TEXTURE_DESC tex_desc = {
+        .filterMode = CU_TR_FILTER_MODE_POINT,
+        .flags = CU_TRSF_READ_AS_INTEGER,
+    };
+
+    CUDA_RESOURCE_DESC res_desc = {
+        .resType = CU_RESOURCE_TYPE_PITCH2D,
+        .res.pitch2D.format = pixel_size == 1 ?
+                              CU_AD_FORMAT_UNSIGNED_INT8 :
+                              CU_AD_FORMAT_UNSIGNED_INT16,
+        .res.pitch2D.numChannels = channels,
+        .res.pitch2D.width = src_width,
+        .res.pitch2D.height = src_height,
+        .res.pitch2D.pitchInBytes = src_pitch,
+        .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
+    };
+
+    src_pitch /= channels * pixel_size;
+
+    ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
+    if (ret < 0)
+        goto exit;
+
+    ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_horiz_uchar2,
+                                      1, DIV_UP(src_height, BLOCKY), 1,
+                                      1, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL));
+
+exit:
+    if (tex)
+        CHECK_CU(cu->cuTexObjectDestroy(tex));
+
+    return ret;
+}
+
+
+static int call_horiz(AVFilterContext *ctx, int channels,
+                      uint8_t *src_dptr, int src_width, int src_height, int src_pitch, int *integ_img,
+                      int *dx_cur, int *dy_cur,
+                      int pixel_size)
+{
+    NLMeansCudaContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+    CUtexObject tex = 0;
+    int ret;
+
+
+    void *args_uchar[] = { &integ_img, &tex, &src_width, &src_height, dx_cur, dy_cur };
+
+
+    CUDA_TEXTURE_DESC tex_desc = {
+        .filterMode = CU_TR_FILTER_MODE_POINT,
+        .flags = CU_TRSF_READ_AS_INTEGER,
+    };
+
+    CUDA_RESOURCE_DESC res_desc = {
+        .resType = CU_RESOURCE_TYPE_PITCH2D,
+        .res.pitch2D.format = pixel_size == 1 ?
+                              CU_AD_FORMAT_UNSIGNED_INT8 :
+                              CU_AD_FORMAT_UNSIGNED_INT16,
+        .res.pitch2D.numChannels = channels,
+        .res.pitch2D.width = src_width,
+        .res.pitch2D.height = src_height,
+        .res.pitch2D.pitchInBytes = src_pitch,
+        .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
+    };
+
+    src_pitch /= channels * pixel_size;
+
+    ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
+    if (ret < 0)
+        goto exit;
+
+    ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_horiz_uchar,
+                                      1, DIV_UP(src_height, BLOCKY), 1,
+                                      1, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL));
+
+exit:
+    if (tex)
+        CHECK_CU(cu->cuTexObjectDestroy(tex));
+
+    return ret;
+}
+
+
+static void nlmeans_plane(AVFilterContext *ctx, int channels,
+                          uint8_t *src_dptr, int src_width, int src_height, int src_pitch, 
+                          uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch, 
+                          int *integ_img,
+                          int p, int r,
+                          int pixel_size)
+{
+    NLMeansCudaContext *s   = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+    
+
+    int nb_pixel, *tmp = NULL, idx = 0;
+    int *dxdy = NULL;
+    int i, dx, dy = 0;
+
+    nb_pixel = (2 * r + 1) * (2 * r + 1) - 1;
+    dxdy = av_malloc(nb_pixel * 2 * sizeof(int));
+    tmp = av_malloc(nb_pixel * 2 * sizeof(int));
+
+    for (dx = -r; dx <= r; dx++) {
+        for (dy = -r; dy <= r; dy++) {
+            if (dx || dy) {
+                tmp[idx++] = dx;
+                tmp[idx++] = dy;
+            }
+        }
+    }
+    // repack dx/dy seperately, as we want to do four pairs of dx/dy in a batch
+    for (i = 0; i < nb_pixel / 4; i++) {
+        dxdy[i * 8] = tmp[i * 8];         // dx0
+        dxdy[i * 8 + 1] = tmp[i * 8 + 2]; // dx1
+        dxdy[i * 8 + 2] = tmp[i * 8 + 4]; // dx2
+        dxdy[i * 8 + 3] = tmp[i * 8 + 6]; // dx3
+        dxdy[i * 8 + 4] = tmp[i * 8 + 1]; // dy0
+        dxdy[i * 8 + 5] = tmp[i * 8 + 3]; // dy1
+        dxdy[i * 8 + 6] = tmp[i * 8 + 5]; // dy2
+        dxdy[i * 8 + 7] = tmp[i * 8 + 7]; // dy3
+    }
+    av_freep(&tmp);
+
+
+    // fill with 0s
+    CHECK_CU(cu->cuMemsetD8Async(s->weight, 0, src_width * src_height * sizeof(float), s->cu_stream));
+    CHECK_CU(cu->cuMemsetD8Async(s->sum, 0, src_width * src_height * sizeof(float), s->cu_stream));
+
+
+    for (i = 0; i < nb_pixel / 4; i++) {
+
+        int *dx_cur = dxdy + 8 * i;
+        int *dy_cur = dxdy + 8 * i + 4;
+
+        call_horiz(ctx, 1, src_dptr, src_width, src_height, src_pitch, 
+                   integ_img, dx_cur, dy_cur, 1);
+
+        call_vert(ctx, 1, src_width, src_height, integ_img, 1);
+
+        call_weight(ctx, 1, src_dptr, src_width, src_height, src_pitch, integ_img, (float*)s->sum, (float*)s->weight, p, dx_cur, dy_cur, 1);
+    }
+
+    call_average(ctx, s->cu_func_average_uchar, 1, src_dptr, src_width, src_height, src_pitch, (float*)s->sum, (float*)s->weight,
+                   dst_dptr, dst_width, dst_height, dst_pitch, 1);
+}
+
+static void nlmeans_plane2(AVFilterContext *ctx, int channels,
+                           uint8_t *src_dptr, int src_width, int src_height, int src_pitch, 
+                           uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch,
+                           int *integ_img, int *integ_img2,
+                           int p, int r,
+                           int pixel_size)
+{
+    NLMeansCudaContext *s   = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+    int nb_pixel, *tmp = NULL, idx = 0;
+    int *dxdy = NULL;
+    int i, dx, dy = 0;
+
+    nb_pixel = (2 * r + 1) * (2 * r + 1) - 1;
+    dxdy = av_malloc(nb_pixel * 2 * sizeof(int));
+    tmp = av_malloc(nb_pixel * 2 * sizeof(int));
+
+    for (dx = -r; dx <= r; dx++) {
+        for (dy = -r; dy <= r; dy++) {
+            if (dx || dy) {
+                tmp[idx++] = dx;
+                tmp[idx++] = dy;
+            }
+        }
+    }
+    // repack dx/dy seperately, as we want to do four pairs of dx/dy in a batch
+    for (i = 0; i < nb_pixel / 4; i++) {
+        dxdy[i * 8] = tmp[i * 8];         // dx0
+        dxdy[i * 8 + 1] = tmp[i * 8 + 2]; // dx1
+        dxdy[i * 8 + 2] = tmp[i * 8 + 4]; // dx2
+        dxdy[i * 8 + 3] = tmp[i * 8 + 6]; // dx3
+        dxdy[i * 8 + 4] = tmp[i * 8 + 1]; // dy0
+        dxdy[i * 8 + 5] = tmp[i * 8 + 3]; // dy1
+        dxdy[i * 8 + 6] = tmp[i * 8 + 5]; // dy2
+        dxdy[i * 8 + 7] = tmp[i * 8 + 7]; // dy3
+    }
+    av_freep(&tmp);
+
+
+    // fill with 0s
+    CHECK_CU(cu->cuMemsetD8Async(s->weight, 0, src_width * src_height * sizeof(float), s->cu_stream));
+    CHECK_CU(cu->cuMemsetD8Async(s->sum, 0, src_width * src_height * sizeof(float), s->cu_stream));
+    CHECK_CU(cu->cuMemsetD8Async(s->weight2, 0, src_width * src_height * sizeof(float), s->cu_stream));
+    CHECK_CU(cu->cuMemsetD8Async(s->sum2, 0, src_width * src_height * sizeof(float), s->cu_stream));
+
+    for (i = 0; i < nb_pixel / 4; i++) {
+
+        int *dx_cur = dxdy + 8 * i;
+        int *dy_cur = dxdy + 8 * i + 4;
+
+        call_horiz2(ctx, 2, src_dptr, src_width, src_height, src_pitch, 
+                    integ_img, integ_img2, dx_cur, dy_cur, 1);
+
+        call_vert2(ctx, 2, src_width, src_height, integ_img, integ_img2, 1);
+
+        call_weight2(ctx, 2, src_dptr, src_width, src_height, src_pitch, integ_img, integ_img2, (float*)s->sum, (float*)s->sum2, (float*)s->weight, (float*)s->weight2, p, dx_cur, dy_cur, 1);
+    }
+
+    call_average2(ctx, s->cu_func_average_uchar2, 2, src_dptr, src_width, src_height, src_pitch, (float*)s->sum, (float*)s->sum2, (float*)s->weight, (float*)s->weight2,
+                  dst_dptr, dst_width, dst_height, dst_pitch, 1);
+}
+
+
+static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
+{
+    AVFilterContext *ctx  = inlink->dst;
+    NLMeansCudaContext *s   = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+    AVFilterLink *outlink = ctx->outputs[0];
+    AVHWFramesContext *hw_frames_ctx = (AVHWFramesContext*)s->hw_frames_ctx->data;
+    CUcontext dummy;
+
+    AVFrame *output = NULL;
+    int err, patch, research, patch_uv, research_uv;
+
+    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+    if (!output) {
+        err = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    err = av_frame_copy_props(output, frame);
+    if (err < 0)
+        goto fail;
+
+    if (!s->initialised) {
+        init(ctx);
+    }
+
+    patch = s->patch_size / 2;
+    research = s->research_size / 2;
+    patch_uv = s->patch_size_uv / 2;
+    research_uv = s->research_size_uv / 2;
+
+    err = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
+    if (err < 0)
+        return err;
+
+
+
+    switch (hw_frames_ctx->sw_format) {
+    case AV_PIX_FMT_NV12:
+        nlmeans_plane(ctx, 1, frame->data[0], inlink->w, inlink->h, frame->linesize[0], 
+                      output->data[0], output->width, output->height, output->linesize[0],
+                      (int*)s->integral_img, patch, research, 1);
+
+        nlmeans_plane2(ctx, 2, frame->data[1], inlink->w / 2, inlink->h / 2, frame->linesize[1], 
+                       output->data[1], output->width / 2, output->height / 2, output->linesize[1],
+                       (int*)s->integral_img, (int*)s->integral_img2, patch_uv, research_uv, 1);
+
+        break;
+    default:
+        return AVERROR_BUG;
+    }
+
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    if (err < 0)
+        return err;
+
+    av_frame_free(&frame);
+
+    return ff_filter_frame(outlink, output);
+
+fail:
+    av_frame_free(&frame);
+    av_frame_free(&output);
+    return err;
+}
+
+static av_cold void uninit(AVFilterContext *ctx)
+{
+    NLMeansCudaContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+
+    if (s->cu_module) {
+        CHECK_CU(cu->cuModuleUnload(s->cu_module));
+        s->cu_module = NULL;
+    }
+}
+
+
+static int config_props(AVFilterLink *inlink)
+{
+    AVFilterContext *ctx = inlink->dst;
+    NLMeansCudaContext *s = ctx->priv;
+    AVHWFramesContext     *hw_frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
+    AVCUDADeviceContext *device_hwctx = hw_frames_ctx->device_ctx->hwctx;
+    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
+    CudaFunctions *cu = device_hwctx->internal->cuda_dl;
+    int ret;
+
+    extern const unsigned char ff_vf_nlmeans_cuda_ptx_data[];
+    extern const unsigned int ff_vf_nlmeans_cuda_ptx_len;
+
+    int width = inlink->w;
+    int height = inlink->h;
+
+    s->hwctx = device_hwctx;
+    s->cu_stream = s->hwctx->stream;
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
+    if (ret < 0)
+        return ret;
+
+    ret = ff_cuda_load_module(ctx, device_hwctx, &s->cu_module, ff_vf_nlmeans_cuda_ptx_data, ff_vf_nlmeans_cuda_ptx_len);
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_horiz_uchar, s->cu_module, "nlmeans_horiz_uchar"));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_horiz_uchar2, s->cu_module, "nlmeans_horiz_uchar2"));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_vert_uchar, s->cu_module, "nlmeans_vert_uchar"));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_vert_uchar2, s->cu_module, "nlmeans_vert_uchar2"));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_weight_uchar, s->cu_module, "nlmeans_weight_uchar"));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_weight_uchar2, s->cu_module, "nlmeans_weight_uchar2"));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_average_uchar, s->cu_module, "nlmeans_average_uchar"));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_average_uchar2, s->cu_module, "nlmeans_average_uchar2"));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuMemAlloc(&s->integral_img, 4 * width * height * sizeof(int)));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuMemAlloc(&s->integral_img2, 4 * width * height * sizeof(int)));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuMemAlloc(&s->weight, width * height * sizeof(float)));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuMemAlloc(&s->sum, width * height * sizeof(float)));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuMemAlloc(&s->weight2, width * height * sizeof(float)));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuMemAlloc(&s->sum2, width * height * sizeof(float)));
+    if (ret < 0)
+        return ret;
+
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+
+    s->hw_frames_ctx = ctx->inputs[0]->hw_frames_ctx;
+
+    ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->hw_frames_ctx);
+    if (!ctx->outputs[0]->hw_frames_ctx)
+        return AVERROR(ENOMEM);
+
+
+    if (!format_is_supported(hw_frames_ctx->sw_format)) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n", av_get_pix_fmt_name(hw_frames_ctx->sw_format));
+        return AVERROR(ENOSYS);
+    }
+
+    return 0;
+}
+
+static int query_formats(AVFilterContext *ctx)
+{
+    static const enum AVPixelFormat pix_fmts[] = {
+        AV_PIX_FMT_CUDA,
+        AV_PIX_FMT_NONE
+    };
+    AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
+    if (!fmts_list)
+        return AVERROR(ENOMEM);
+    return ff_set_common_formats(ctx, fmts_list);
+}
+
+AVFILTER_DEFINE_CLASS(nlmeans_cuda);
+
+static const AVFilterPad nlmeans_cuda_inputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = config_props,
+        .filter_frame = filter_frame,
+    },
+    { NULL }
+};
+
+static const AVFilterPad nlmeans_cuda_outputs[] = {
+    {
+        .name          = "default",
+        .type          = AVMEDIA_TYPE_VIDEO,
+    },
+    { NULL }
+};
+
+const AVFilter ff_vf_nlmeans_cuda = {
+    .name          = "nlmeans_cuda",
+    .description   = NULL_IF_CONFIG_SMALL("Non-local means denoiser through CUDA"),
+    .priv_size     = sizeof(NLMeansCudaContext),
+    .init          = init,
+    .uninit        = uninit,
+    .query_formats = query_formats,
+    .inputs        = nlmeans_cuda_inputs,
+    .outputs       = nlmeans_cuda_outputs,
+    .priv_class    = &nlmeans_cuda_class,
+    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};
diff --git a/libavfilter/vf_nlmeans_cuda.cu b/libavfilter/vf_nlmeans_cuda.cu
new file mode 100644
index 0000000000..74cd70aa37
--- /dev/null
+++ b/libavfilter/vf_nlmeans_cuda.cu
@@ -0,0 +1,361 @@
+#include "cuda/vector_helpers.cuh"
+
+
+extern "C" {
+
+__global__ void nlmeans_average_uchar(
+    cudaTextureObject_t tex,
+    uchar *dst,
+    int width, int dst_pitch,
+    float* sum, float* weight)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    float w = weight[y * width + x];
+    float s = sum[y * width + x];
+
+    uchar src_pix_uchar = tex2D<uchar>(tex, x, y);
+    int src_pix_int = src_pix_uchar;
+    float src_pix = (float)src_pix_int;
+
+    float resultf = (s + src_pix) / (1 + w);
+
+    int result = (int)resultf;
+
+    dst[y*dst_pitch+x] = result;
+}
+
+
+__global__ void nlmeans_average_uchar2(
+    cudaTextureObject_t tex,
+    uchar2 *dst,
+    int width, int dst_pitch,
+    float* sum, float* sum2, float* weight, float* weight2)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    float w = weight[y * width + x];
+    float s = sum[y * width + x];
+
+    float w2 = weight2[y * width + x];
+    float s2 = sum2[y * width + x];
+
+    uchar2 src_pix_uchar = tex2D<uchar2>(tex, x, y);
+    
+    int src_pix_int = src_pix_uchar.x;
+    float src_pix = (float)src_pix_int;
+
+    int src_pix_int2 = src_pix_uchar.y;
+    float src_pix2 = (float)src_pix_int2;
+
+    float resultf = (s + src_pix) / (1 + w);
+    float result2f = (s2 + src_pix2) / (1 + w2);
+
+    int result = (int)resultf;
+    int result2 = (int)result2f;
+
+    uchar2 output;
+    output.x = result;
+    output.y = result2;
+
+    dst[y*dst_pitch+x] = output;
+}
+
+
+__global__ void nlmeans_weight_uchar2(
+    float* sum, float* sum2, 
+    float* weight, float* weight2,
+    int4 *integral_img,
+    int4 *integral_img2,
+    cudaTextureObject_t tex,
+    int width, int height,
+    int p, float h,
+    int4 dx, int4 dy)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    int4 xoff = make_int4(x, x, x, x) + dx;
+    int4 yoff = make_int4(y, y, y, y) + dy;
+
+    int4 a = make_int4(0, 0, 0, 0);
+    int4 b = make_int4(0, 0, 0, 0);
+    int4 c = make_int4(0, 0, 0, 0);
+    int4 d = make_int4(0, 0, 0, 0);
+
+    int4 a2 = make_int4(0, 0, 0, 0);
+    int4 b2 = make_int4(0, 0, 0, 0);
+    int4 c2 = make_int4(0, 0, 0, 0);
+    int4 d2 = make_int4(0, 0, 0, 0);
+
+    int4 src_pix  = make_int4(0, 0, 0, 0);
+    int4 src_pix2 = make_int4(0, 0, 0, 0);
+
+    int oobb = (x - p) < 0 || (y - p) < 0 || (y + p) >= height || (x + p) >= width;
+
+    uchar2 ax, ay, az, aw;
+    ax = tex2D<uchar2>(tex, xoff.x, yoff.x);
+    ay = tex2D<uchar2>(tex, xoff.y, yoff.y);
+    az = tex2D<uchar2>(tex, xoff.z, yoff.z);
+    aw = tex2D<uchar2>(tex, xoff.w, yoff.w);
+
+    src_pix.x = ax.x;
+    src_pix.y = ay.x;
+    src_pix.z = az.x;
+    src_pix.w = aw.x;
+
+    src_pix2.x = ax.y;
+    src_pix2.y = ay.y;
+    src_pix2.z = az.y;
+    src_pix2.w = aw.y;
+
+    if (!oobb) {
+        a = integral_img[(y - p) * width + x - p];
+        b = integral_img[(y + p) * width + x - p];
+        c = integral_img[(y - p) * width + x + p];
+        d = integral_img[(y + p) * width + x + p];
+
+        a2 = integral_img2[(y - p) * width + x - p];
+        b2 = integral_img2[(y + p) * width + x - p];
+        c2 = integral_img2[(y - p) * width + x + p];
+        d2 = integral_img2[(y + p) * width + x + p];
+    }
+
+    int4 patch_diff_int = d + a - c - b;
+    float4 patch_diff;
+    patch_diff.x = (float) (-patch_diff_int.x);
+    patch_diff.y = (float) (-patch_diff_int.y);
+    patch_diff.z = (float) (-patch_diff_int.z);
+    patch_diff.w = (float) (-patch_diff_int.w);
+
+    int4 patch_diff_int2 = d2 + a2 - c2 - b2;
+    float4 patch_diff2;
+    patch_diff2.x = (float) (-patch_diff_int2.x);
+    patch_diff2.y = (float) (-patch_diff_int2.y);
+    patch_diff2.z = (float) (-patch_diff_int2.z);
+    patch_diff2.w = (float) (-patch_diff_int2.w);
+
+
+    float4 w;
+    w.x = exp(patch_diff.x / (h*h));
+    w.y = exp(patch_diff.y / (h*h));
+    w.z = exp(patch_diff.z / (h*h));
+    w.w = exp(patch_diff.w / (h*h));
+
+    float4 w2;
+    w2.x = exp(patch_diff2.x / (h*h));
+    w2.y = exp(patch_diff2.y / (h*h));
+    w2.z = exp(patch_diff2.z / (h*h));
+    w2.w = exp(patch_diff2.w / (h*h));
+
+    float w_sum = w.x + w.y + w.z + w.w;
+    weight[y * width + x] += w_sum;
+
+    float w_sum2 = w2.x + w2.y + w2.z + w2.w;
+    weight2[y * width + x] += w_sum2;
+
+    float4 src_pixf;
+    src_pixf.x = (float)src_pix.x;
+    src_pixf.y = (float)src_pix.y;
+    src_pixf.z = (float)src_pix.z;
+    src_pixf.w = (float)src_pix.w;
+
+    float4 src_pix2f;
+    src_pix2f.x = (float)src_pix2.x;
+    src_pix2f.y = (float)src_pix2.y;
+    src_pix2f.z = (float)src_pix2.z;
+    src_pix2f.w = (float)src_pix2.w;
+
+    sum[y * width + x] += w.x * src_pixf.x + w.y * src_pixf.y + w.z * src_pixf.z + w.w * src_pixf.w;
+    sum2[y * width + x] += w2.x * src_pix2f.x + w2.y * src_pix2f.y + w2.z * src_pix2f.z + w2.w * src_pix2f.w;
+}
+
+
+__global__ void nlmeans_weight_uchar(
+    float* sum, float* weight,
+    int4 *integral_img,
+    cudaTextureObject_t tex,
+    int width, int height,
+    int p, float h,
+    int4 dx, int4 dy)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    int4 xoff = make_int4(x, x, x, x) + dx;
+    int4 yoff = make_int4(y, y, y, y) + dy;
+
+    int4 a = make_int4(0, 0, 0, 0);
+    int4 b = make_int4(0, 0, 0, 0);
+    int4 c = make_int4(0, 0, 0, 0);
+    int4 d = make_int4(0, 0, 0, 0);
+
+    int4 src_pix = make_int4(0, 0, 0, 0);
+
+    int oobb = (x - p) < 0 || (y - p) < 0 || (y + p) >= height || (x + p) >= width;
+
+    src_pix.x = tex2D<uchar>(tex, xoff.x, yoff.x);
+    src_pix.y = tex2D<uchar>(tex, xoff.y, yoff.y);
+    src_pix.z = tex2D<uchar>(tex, xoff.z, yoff.z);
+    src_pix.w = tex2D<uchar>(tex, xoff.w, yoff.w);
+
+    if (!oobb) {
+        a = integral_img[(y - p) * width + x - p];
+        b = integral_img[(y + p) * width + x - p];
+        c = integral_img[(y - p) * width + x + p];
+        d = integral_img[(y + p) * width + x + p];
+    }
+
+    int4 patch_diff_int = d + a - c - b;
+    float4 patch_diff;
+    patch_diff.x = (float) (-patch_diff_int.x);
+    patch_diff.y = (float) (-patch_diff_int.y);
+    patch_diff.z = (float) (-patch_diff_int.z);
+    patch_diff.w = (float) (-patch_diff_int.w);
+
+
+    float4 w;
+    w.x = exp(patch_diff.x / (h*h));
+    w.y = exp(patch_diff.y / (h*h));
+    w.z = exp(patch_diff.z / (h*h));
+    w.w = exp(patch_diff.w / (h*h));
+
+    float w_sum = w.x + w.y + w.z + w.w;
+    weight[y * width + x] += w_sum;
+
+    float4 src_pixf;
+    src_pixf.x = (float)src_pix.x;
+    src_pixf.y = (float)src_pix.y;
+    src_pixf.z = (float)src_pix.z;
+    src_pixf.w = (float)src_pix.w;
+
+    sum[y * width + x] += w.x * src_pixf.x + w.y * src_pixf.y + w.z * src_pixf.z + w.w * src_pixf.w;
+}
+
+
+__global__ void nlmeans_vert_uchar(
+    int4 *integral_img,
+    int width, int height)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+    int4 sum = make_int4(0, 0, 0, 0);
+
+    for (int i = 0; i < height; i++) {
+        integral_img[i * width + x] += sum;
+        sum = integral_img[i * width + x];
+    }
+}
+
+
+__global__ void nlmeans_vert_uchar2(
+    int4 *integral_img,
+    int4 *integral_img2,
+    int width, int height)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+    int4 sum = make_int4(0, 0, 0, 0);
+    int4 sum2 = make_int4(0, 0, 0, 0);
+
+    for (int i = 0; i < height; i++) {
+        integral_img[i * width + x] += sum;
+        sum = integral_img[i * width + x];
+
+        integral_img2[i * width + x] += sum2;
+        sum2 = integral_img2[i * width + x];
+    }
+}
+
+
+__global__ void nlmeans_horiz_uchar(
+    int4 *integral_img,
+    cudaTextureObject_t tex,
+    int width, int height,
+    int4 dx, int4 dy)
+{
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    int4 sum = make_int4(0, 0, 0, 0);
+    uchar s2x, s2y, s2z, s2w;
+
+    for (int i = 0; i < width; i++) {
+        uchar s1c = tex2D<uchar>(tex, i, y);
+        int4 s1 = make_int4(s1c, s1c, s1c, s1c);
+
+        s2x = tex2D<uchar>(tex, i + dx.x, y + dy.x);
+        s2y = tex2D<uchar>(tex, i + dx.y, y + dy.y);
+        s2z = tex2D<uchar>(tex, i + dx.z, y + dy.z);
+        s2w = tex2D<uchar>(tex, i + dx.w, y + dy.w);
+
+        int4 s2 = make_int4(s2x, s2y, s2z, s2w);
+
+        //sum += (s1 - s2) * (s1 - s2);
+        int4 s3 = s1 - s2;
+        int4 s4;
+        s4.x = s3.x * s3.x;
+        s4.y = s3.y * s3.y;
+        s4.z = s3.z * s3.z;
+        s4.w = s3.w * s3.w;
+
+        sum += s4;
+        
+        integral_img[y * width + i] = sum;
+    }
+}
+
+
+__global__ void nlmeans_horiz_uchar2(
+    int4 *integral_img,
+    int4 *integral_img2,
+    cudaTextureObject_t tex,
+    int width, int height,
+    int4 dx, int4 dy)
+{
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    int4 sum = make_int4(0, 0, 0, 0);
+    int4 sum2 = make_int4(0, 0, 0, 0);
+    uchar2 s2x, s2y, s2z, s2w;
+
+    for (int i = 0; i < width; i++) {
+        uchar2 s1c = tex2D<uchar2>(tex, i, y);
+        int4 s1 = make_int4(s1c.x, s1c.x, s1c.x, s1c.x);
+        int4 r1 = make_int4(s1c.y, s1c.y, s1c.y, s1c.y);
+
+        s2x = tex2D<uchar2>(tex, i + dx.x, y + dy.x);
+        s2y = tex2D<uchar2>(tex, i + dx.y, y + dy.y);
+        s2z = tex2D<uchar2>(tex, i + dx.z, y + dy.z);
+        s2w = tex2D<uchar2>(tex, i + dx.w, y + dy.w);
+
+        int4 s2 = make_int4(s2x.x, s2y.x, s2z.x, s2w.x);
+        int4 r2 = make_int4(s2x.y, s2y.y, s2z.y, s2w.y);
+
+        //sum += (s1 - s2) * (s1 - s2);
+        int4 s3 = s1 - s2;
+        int4 s4;
+        s4.x = s3.x * s3.x;
+        s4.y = s3.y * s3.y;
+        s4.z = s3.z * s3.z;
+        s4.w = s3.w * s3.w;
+
+        sum += s4;
+
+        //sum2 += (r1 - r2) * (r1 - r2);
+        int4 r3 = r1 - r2;
+        int4 r4;
+        r4.x = r3.x * r3.x;
+        r4.y = r3.y * r3.y;
+        r4.z = r3.z * r3.z;
+        r4.w = r3.w * r3.w;
+
+        sum2 += r4;
+
+        integral_img[y * width + i] = sum;
+        integral_img2[y * width + i] = sum2;
+    }
+}
+
+}