@@ -3094,6 +3094,8 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
transpose_npp_filter_deps="ffnvcodec libnpp"
overlay_cuda_filter_deps="ffnvcodec"
overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+nlmeans_cuda_filter_deps="ffnvcodec"
+nlmeans_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
amf_deps_any="libdl LoadLibrary"
nvenc_deps="ffnvcodec"
@@ -15380,6 +15380,10 @@ Same as @option{r} but for chroma planes.
The default value is @var{0} and means automatic.
@end table
+@section nlmeans_cuda
+
+Non-local Means denoise filter through CUDA, this filter accepts same options as @ref{nlmeans}.
+
@section nnedi
Deinterlace video using neural network edge directed interpolation.
@@ -347,6 +347,8 @@ OBJS-$(CONFIG_MPDECIMATE_FILTER) += vf_mpdecimate.o
OBJS-$(CONFIG_NEGATE_FILTER) += vf_lut.o
OBJS-$(CONFIG_NLMEANS_FILTER) += vf_nlmeans.o
OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER) += vf_nlmeans_opencl.o opencl.o opencl/nlmeans.o
+OBJS-$(CONFIG_NLMEANS_CUDA_FILTER) += vf_nlmeans_cuda.o vf_nlmeans_cuda.ptx.o \
+ cuda/load_helper.o
OBJS-$(CONFIG_NNEDI_FILTER) += vf_nnedi.o
OBJS-$(CONFIG_NOFORMAT_FILTER) += vf_format.o
OBJS-$(CONFIG_NOISE_FILTER) += vf_noise.o
@@ -333,6 +333,7 @@ extern const AVFilter ff_vf_msad;
extern const AVFilter ff_vf_negate;
extern const AVFilter ff_vf_nlmeans;
extern const AVFilter ff_vf_nlmeans_opencl;
+extern const AVFilter ff_vf_nlmeans_cuda;
extern const AVFilter ff_vf_nnedi;
extern const AVFilter ff_vf_noformat;
extern const AVFilter ff_vf_noise;
@@ -30,8 +30,8 @@
#include "libavutil/version.h"
#define LIBAVFILTER_VERSION_MAJOR 8
-#define LIBAVFILTER_VERSION_MINOR 7
-#define LIBAVFILTER_VERSION_MICRO 101
+#define LIBAVFILTER_VERSION_MINOR 8
+#define LIBAVFILTER_VERSION_MICRO 100
#define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
new file mode 100644
@@ -0,0 +1,883 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "avfilter.h"
+#include "internal.h"
+
+#include "cuda/load_helper.h"
+
+static const enum AVPixelFormat supported_formats[] = {
+ AV_PIX_FMT_NV12,
+ AV_PIX_FMT_YUV420P,
+ AV_PIX_FMT_YUV444P
+};
+
+
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
+
+#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
+#define BLOCKX 32
+#define BLOCKY 16
+
+
+
+typedef struct NLMeansCudaContext {
+ const AVClass *class;
+
+ double sigma;
+ int patch_size;
+ int patch_size_uv;
+ int research_size;
+ int research_size_uv;
+ int initialised;
+
+ float h;
+
+ AVBufferRef *hw_frames_ctx;
+ AVCUDADeviceContext *hwctx;
+
+ CUmodule cu_module;
+
+ CUfunction cu_func_horiz_uchar;
+ CUfunction cu_func_horiz_uchar2;
+ CUfunction cu_func_vert_uchar;
+ CUfunction cu_func_vert_uchar2;
+ CUfunction cu_func_weight_uchar;
+ CUfunction cu_func_weight_uchar2;
+ CUfunction cu_func_average_uchar;
+ CUfunction cu_func_average_uchar2;
+ CUstream cu_stream;
+
+
+ CUdeviceptr integral_img;
+ CUdeviceptr integral_img2;
+ CUdeviceptr weight;
+ CUdeviceptr weight2;
+ CUdeviceptr sum;
+ CUdeviceptr sum2;
+
+} NLMeansCudaContext;
+
+#define OFFSET(x) offsetof(NLMeansCudaContext, x)
+#define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
+
+static const AVOption nlmeans_cuda_options[] = {
+ { "s", "denoising strength", OFFSET(sigma), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 30.0, FLAGS },
+ { "p", "patch size", OFFSET(patch_size), AV_OPT_TYPE_INT, { .i64 = 2*3+1 }, 0, 99, FLAGS },
+ { "pc", "patch size for chroma planes", OFFSET(patch_size_uv), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 99, FLAGS },
+ { "r", "research window", OFFSET(research_size), AV_OPT_TYPE_INT, { .i64 = 7*2+1 }, 0, 99, FLAGS },
+ { "rc", "research window for chroma planes", OFFSET(research_size_uv), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 99, FLAGS },
+ { NULL }
+};
+
+
+
+static av_cold int init(AVFilterContext *avctx)
+{
+ NLMeansCudaContext *ctx = avctx->priv;
+
+ ctx->h = ctx->sigma * 10;
+ if (!(ctx->research_size & 1)) {
+ ctx->research_size |= 1;
+ av_log(avctx, AV_LOG_WARNING,
+ "research_size should be odd, set to %d",
+ ctx->research_size);
+ }
+
+ if (!(ctx->patch_size & 1)) {
+ ctx->patch_size |= 1;
+ av_log(avctx, AV_LOG_WARNING,
+ "patch_size should be odd, set to %d",
+ ctx->patch_size);
+ }
+
+ if (!ctx->research_size_uv)
+ ctx->research_size_uv = ctx->research_size;
+ if (!ctx->patch_size_uv)
+ ctx->patch_size_uv = ctx->patch_size;
+
+
+ ctx->initialised = 1;
+
+ return 0;
+}
+
+
+static int format_is_supported(enum AVPixelFormat fmt)
+{
+ int i;
+
+ for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
+ if (supported_formats[i] == fmt)
+ return 1;
+ return 0;
+}
+
+
+static int call_average2(AVFilterContext *ctx, int channels,
+ uint8_t *src_dptr, int src_width, int src_height, int src_pitch,
+ float *sum, float *sum2, float *weight, float *weight2,
+ uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch,
+ int pixel_size)
+{
+ NLMeansCudaContext *s = ctx->priv;
+ CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+ CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr;
+ CUtexObject tex = 0;
+ void *args_uchar[] = { &tex, &dst_devptr, &src_width, &dst_pitch, &sum, &sum2, &weight, &weight2};
+
+ int ret;
+
+ CUDA_TEXTURE_DESC tex_desc = {
+ .filterMode = CU_TR_FILTER_MODE_POINT,
+ .flags = CU_TRSF_READ_AS_INTEGER,
+ };
+
+ CUDA_RESOURCE_DESC res_desc = {
+ .resType = CU_RESOURCE_TYPE_PITCH2D,
+ .res.pitch2D.format = pixel_size == 1 ?
+ CU_AD_FORMAT_UNSIGNED_INT8 :
+ CU_AD_FORMAT_UNSIGNED_INT16,
+ .res.pitch2D.numChannels = channels,
+ .res.pitch2D.width = src_width,
+ .res.pitch2D.height = src_height,
+ .res.pitch2D.pitchInBytes = src_pitch,
+ .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
+ };
+
+ dst_pitch /= channels * pixel_size;
+
+ ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
+ if (ret < 0)
+ goto exit;
+
+ ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_average_uchar2,
+ DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1,
+ BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL));
+
+exit:
+ if (tex)
+ CHECK_CU(cu->cuTexObjectDestroy(tex));
+
+ return ret;
+}
+
+
+static int call_average(AVFilterContext *ctx, int channels,
+ uint8_t *src_dptr, int src_width, int src_height, int src_pitch, float *sum, float *weight,
+ uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch,
+ int pixel_size)
+{
+ NLMeansCudaContext *s = ctx->priv;
+ CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+ CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr;
+ CUtexObject tex = 0;
+ void *args_uchar[] = { &tex, &dst_devptr, &src_width, &dst_pitch, &sum, &weight};
+
+ int ret;
+
+ CUDA_TEXTURE_DESC tex_desc = {
+ .filterMode = CU_TR_FILTER_MODE_POINT,
+ .flags = CU_TRSF_READ_AS_INTEGER,
+ };
+
+ CUDA_RESOURCE_DESC res_desc = {
+ .resType = CU_RESOURCE_TYPE_PITCH2D,
+ .res.pitch2D.format = pixel_size == 1 ?
+ CU_AD_FORMAT_UNSIGNED_INT8 :
+ CU_AD_FORMAT_UNSIGNED_INT16,
+ .res.pitch2D.numChannels = channels,
+ .res.pitch2D.width = src_width,
+ .res.pitch2D.height = src_height,
+ .res.pitch2D.pitchInBytes = src_pitch,
+ .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
+ };
+
+ dst_pitch /= channels * pixel_size;
+
+ ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
+ if (ret < 0)
+ goto exit;
+
+ ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_average_uchar,
+ DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1,
+ BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL));
+
+exit:
+ if (tex)
+ CHECK_CU(cu->cuTexObjectDestroy(tex));
+
+ return ret;
+}
+
+static int call_weight2(AVFilterContext *ctx, int channels,
+ uint8_t *src_dptr, int src_width, int src_height, int src_pitch, int *integ_img, int *integ_img2,
+ float *sum, float *sum2, float *weight, float *weight2, int p,
+ int *dx_cur, int *dy_cur,
+ int pixel_size)
+{
+ NLMeansCudaContext *s = ctx->priv;
+ CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+ CUtexObject tex = 0;
+ int ret;
+
+
+ void *args_uchar[] = { &sum, &sum2, &weight, &weight2, &integ_img, &integ_img2, &tex, &src_width, &src_height, &p, &s->h, dx_cur, dy_cur };
+
+
+ CUDA_TEXTURE_DESC tex_desc = {
+ .filterMode = CU_TR_FILTER_MODE_POINT,
+ .flags = CU_TRSF_READ_AS_INTEGER,
+ };
+
+ CUDA_RESOURCE_DESC res_desc = {
+ .resType = CU_RESOURCE_TYPE_PITCH2D,
+ .res.pitch2D.format = pixel_size == 1 ?
+ CU_AD_FORMAT_UNSIGNED_INT8 :
+ CU_AD_FORMAT_UNSIGNED_INT16,
+ .res.pitch2D.numChannels = channels,
+ .res.pitch2D.width = src_width,
+ .res.pitch2D.height = src_height,
+ .res.pitch2D.pitchInBytes = src_pitch,
+ .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
+ };
+
+ src_pitch /= channels * pixel_size;
+
+ ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
+ if (ret < 0)
+ goto exit;
+
+ ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_weight_uchar2,
+ DIV_UP(src_width, BLOCKX), DIV_UP(src_height, BLOCKY), 1,
+ BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL));
+
+exit:
+ if (tex)
+ CHECK_CU(cu->cuTexObjectDestroy(tex));
+
+ return ret;
+}
+
+
+
+static int call_weight(AVFilterContext *ctx, int channels,
+ uint8_t *src_dptr, int src_width, int src_height, int src_pitch, int *integ_img, float *sum, float *weight, int p,
+ int *dx_cur, int *dy_cur,
+ int pixel_size)
+{
+ NLMeansCudaContext *s = ctx->priv;
+ CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+ CUtexObject tex = 0;
+ int ret;
+
+
+ void *args_uchar[] = { &sum, &weight, &integ_img, &tex, &src_width, &src_height, &p, &s->h, dx_cur, dy_cur };
+
+
+ CUDA_TEXTURE_DESC tex_desc = {
+ .filterMode = CU_TR_FILTER_MODE_POINT,
+ .flags = CU_TRSF_READ_AS_INTEGER,
+ };
+
+ CUDA_RESOURCE_DESC res_desc = {
+ .resType = CU_RESOURCE_TYPE_PITCH2D,
+ .res.pitch2D.format = pixel_size == 1 ?
+ CU_AD_FORMAT_UNSIGNED_INT8 :
+ CU_AD_FORMAT_UNSIGNED_INT16,
+ .res.pitch2D.numChannels = channels,
+ .res.pitch2D.width = src_width,
+ .res.pitch2D.height = src_height,
+ .res.pitch2D.pitchInBytes = src_pitch,
+ .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
+ };
+
+ src_pitch /= channels * pixel_size;
+
+ ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
+ if (ret < 0)
+ goto exit;
+
+ ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_weight_uchar,
+ DIV_UP(src_width, BLOCKX), DIV_UP(src_height, BLOCKY), 1,
+ BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL));
+
+exit:
+ if (tex)
+ CHECK_CU(cu->cuTexObjectDestroy(tex));
+
+ return ret;
+}
+
+
+static int call_vert2(AVFilterContext *ctx, int channels,
+ int src_width, int src_height, int *integ_img, int *integ_img2,
+ int pixel_size)
+{
+ NLMeansCudaContext *s = ctx->priv;
+ CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+ int ret;
+
+ void *args_uchar[] = { &integ_img, &integ_img2, &src_width, &src_height };
+
+
+ ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_vert_uchar2,
+ DIV_UP(src_width, BLOCKX), 1, 1,
+ BLOCKX, 1, 1, 0, s->cu_stream, args_uchar, NULL));
+
+
+ return ret;
+}
+
+
+static int call_vert(AVFilterContext *ctx, int channels,
+ int src_width, int src_height, int *integ_img,
+ int pixel_size)
+{
+ NLMeansCudaContext *s = ctx->priv;
+ CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+ int ret;
+
+ void *args_uchar[] = { &integ_img, &src_width, &src_height };
+
+
+ ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_vert_uchar,
+ DIV_UP(src_width, BLOCKX), 1, 1,
+ BLOCKX, 1, 1, 0, s->cu_stream, args_uchar, NULL));
+
+
+
+ return ret;
+}
+
+
+static int call_horiz2(AVFilterContext *ctx, int channels,
+ uint8_t *src_dptr, int src_width, int src_height, int src_pitch, int *integ_img, int *integ_img2,
+ int *dx_cur, int *dy_cur,
+ int pixel_size)
+{
+ NLMeansCudaContext *s = ctx->priv;
+ CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+ CUtexObject tex = 0;
+ int ret;
+
+
+ void *args_uchar[] = { &integ_img, &integ_img2, &tex, &src_width, &src_height, dx_cur, dy_cur };
+
+
+ CUDA_TEXTURE_DESC tex_desc = {
+ .filterMode = CU_TR_FILTER_MODE_POINT,
+ .flags = CU_TRSF_READ_AS_INTEGER,
+ };
+
+ CUDA_RESOURCE_DESC res_desc = {
+ .resType = CU_RESOURCE_TYPE_PITCH2D,
+ .res.pitch2D.format = pixel_size == 1 ?
+ CU_AD_FORMAT_UNSIGNED_INT8 :
+ CU_AD_FORMAT_UNSIGNED_INT16,
+ .res.pitch2D.numChannels = channels,
+ .res.pitch2D.width = src_width,
+ .res.pitch2D.height = src_height,
+ .res.pitch2D.pitchInBytes = src_pitch,
+ .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
+ };
+
+ src_pitch /= channels * pixel_size;
+
+ ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
+ if (ret < 0)
+ goto exit;
+
+ ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_horiz_uchar2,
+ 1, DIV_UP(src_height, BLOCKY), 1,
+ 1, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL));
+
+exit:
+ if (tex)
+ CHECK_CU(cu->cuTexObjectDestroy(tex));
+
+ return ret;
+}
+
+
+static int call_horiz(AVFilterContext *ctx, int channels,
+ uint8_t *src_dptr, int src_width, int src_height, int src_pitch, int *integ_img,
+ int *dx_cur, int *dy_cur,
+ int pixel_size)
+{
+ NLMeansCudaContext *s = ctx->priv;
+ CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+ CUtexObject tex = 0;
+ int ret;
+
+
+ void *args_uchar[] = { &integ_img, &tex, &src_width, &src_height, dx_cur, dy_cur };
+
+
+ CUDA_TEXTURE_DESC tex_desc = {
+ .filterMode = CU_TR_FILTER_MODE_POINT,
+ .flags = CU_TRSF_READ_AS_INTEGER,
+ };
+
+ CUDA_RESOURCE_DESC res_desc = {
+ .resType = CU_RESOURCE_TYPE_PITCH2D,
+ .res.pitch2D.format = pixel_size == 1 ?
+ CU_AD_FORMAT_UNSIGNED_INT8 :
+ CU_AD_FORMAT_UNSIGNED_INT16,
+ .res.pitch2D.numChannels = channels,
+ .res.pitch2D.width = src_width,
+ .res.pitch2D.height = src_height,
+ .res.pitch2D.pitchInBytes = src_pitch,
+ .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
+ };
+
+ src_pitch /= channels * pixel_size;
+
+ ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
+ if (ret < 0)
+ goto exit;
+
+ ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_horiz_uchar,
+ 1, DIV_UP(src_height, BLOCKY), 1,
+ 1, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL));
+
+exit:
+ if (tex)
+ CHECK_CU(cu->cuTexObjectDestroy(tex));
+
+ return ret;
+}
+
+
+static void nlmeans_plane(AVFilterContext *ctx, int channels,
+ uint8_t *src_dptr, int src_width, int src_height, int src_pitch,
+ uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch,
+ int *integ_img,
+ int p, int r,
+ int pixel_size)
+{
+ NLMeansCudaContext *s = ctx->priv;
+ CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+
+ int nb_pixel, *tmp = NULL, idx = 0;
+ int *dxdy = NULL;
+ int i, dx, dy = 0;
+
+ nb_pixel = (2 * r + 1) * (2 * r + 1) - 1;
+ dxdy = av_malloc(nb_pixel * 2 * sizeof(int));
+ tmp = av_malloc(nb_pixel * 2 * sizeof(int));
+
+ for (dx = -r; dx <= r; dx++) {
+ for (dy = -r; dy <= r; dy++) {
+ if (dx || dy) {
+ tmp[idx++] = dx;
+ tmp[idx++] = dy;
+ }
+ }
+ }
+ // repack dx/dy seperately, as we want to do four pairs of dx/dy in a batch
+ for (i = 0; i < nb_pixel / 4; i++) {
+ dxdy[i * 8] = tmp[i * 8]; // dx0
+ dxdy[i * 8 + 1] = tmp[i * 8 + 2]; // dx1
+ dxdy[i * 8 + 2] = tmp[i * 8 + 4]; // dx2
+ dxdy[i * 8 + 3] = tmp[i * 8 + 6]; // dx3
+ dxdy[i * 8 + 4] = tmp[i * 8 + 1]; // dy0
+ dxdy[i * 8 + 5] = tmp[i * 8 + 3]; // dy1
+ dxdy[i * 8 + 6] = tmp[i * 8 + 5]; // dy2
+ dxdy[i * 8 + 7] = tmp[i * 8 + 7]; // dy3
+ }
+ av_freep(&tmp);
+
+
+ // fill with 0s
+ CHECK_CU(cu->cuMemsetD8Async(s->weight, 0, src_width * src_height * sizeof(float), s->cu_stream));
+ CHECK_CU(cu->cuMemsetD8Async(s->sum, 0, src_width * src_height * sizeof(float), s->cu_stream));
+
+
+ for (i = 0; i < nb_pixel / 4; i++) {
+
+ int *dx_cur = dxdy + 8 * i;
+ int *dy_cur = dxdy + 8 * i + 4;
+
+ call_horiz(ctx, 1, src_dptr, src_width, src_height, src_pitch,
+ integ_img, dx_cur, dy_cur, pixel_size);
+
+ call_vert(ctx, 1, src_width, src_height, integ_img, pixel_size);
+
+ call_weight(ctx, 1, src_dptr, src_width, src_height, src_pitch, integ_img, (float*)s->sum, (float*)s->weight, p, dx_cur, dy_cur, pixel_size);
+ }
+
+ call_average(ctx, 1, src_dptr, src_width, src_height, src_pitch, (float*)s->sum, (float*)s->weight,
+ dst_dptr, dst_width, dst_height, dst_pitch, pixel_size);
+}
+
+static void nlmeans_plane2(AVFilterContext *ctx, int channels,
+ uint8_t *src_dptr, int src_width, int src_height, int src_pitch,
+ uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch,
+ int *integ_img, int *integ_img2,
+ int p, int r,
+ int pixel_size)
+{
+ NLMeansCudaContext *s = ctx->priv;
+ CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+ int nb_pixel, *tmp = NULL, idx = 0;
+ int *dxdy = NULL;
+ int i, dx, dy = 0;
+
+ nb_pixel = (2 * r + 1) * (2 * r + 1) - 1;
+ dxdy = av_malloc(nb_pixel * 2 * sizeof(int));
+ tmp = av_malloc(nb_pixel * 2 * sizeof(int));
+
+ for (dx = -r; dx <= r; dx++) {
+ for (dy = -r; dy <= r; dy++) {
+ if (dx || dy) {
+ tmp[idx++] = dx;
+ tmp[idx++] = dy;
+ }
+ }
+ }
+ // repack dx/dy seperately, as we want to do four pairs of dx/dy in a batch
+ for (i = 0; i < nb_pixel / 4; i++) {
+ dxdy[i * 8] = tmp[i * 8]; // dx0
+ dxdy[i * 8 + 1] = tmp[i * 8 + 2]; // dx1
+ dxdy[i * 8 + 2] = tmp[i * 8 + 4]; // dx2
+ dxdy[i * 8 + 3] = tmp[i * 8 + 6]; // dx3
+ dxdy[i * 8 + 4] = tmp[i * 8 + 1]; // dy0
+ dxdy[i * 8 + 5] = tmp[i * 8 + 3]; // dy1
+ dxdy[i * 8 + 6] = tmp[i * 8 + 5]; // dy2
+ dxdy[i * 8 + 7] = tmp[i * 8 + 7]; // dy3
+ }
+ av_freep(&tmp);
+
+
+ // fill with 0s
+ CHECK_CU(cu->cuMemsetD8Async(s->weight, 0, src_width * src_height * sizeof(float), s->cu_stream));
+ CHECK_CU(cu->cuMemsetD8Async(s->sum, 0, src_width * src_height * sizeof(float), s->cu_stream));
+ CHECK_CU(cu->cuMemsetD8Async(s->weight2, 0, src_width * src_height * sizeof(float), s->cu_stream));
+ CHECK_CU(cu->cuMemsetD8Async(s->sum2, 0, src_width * src_height * sizeof(float), s->cu_stream));
+
+ for (i = 0; i < nb_pixel / 4; i++) {
+
+ int *dx_cur = dxdy + 8 * i;
+ int *dy_cur = dxdy + 8 * i + 4;
+
+ call_horiz2(ctx, 2, src_dptr, src_width, src_height, src_pitch,
+ integ_img, integ_img2, dx_cur, dy_cur, pixel_size);
+
+ call_vert2(ctx, 2, src_width, src_height, integ_img, integ_img2, pixel_size);
+
+ call_weight2(ctx, 2, src_dptr, src_width, src_height, src_pitch, integ_img, integ_img2, (float*)s->sum, (float*)s->sum2, (float*)s->weight, (float*)s->weight2, p, dx_cur, dy_cur, pixel_size);
+ }
+
+ call_average2(ctx, 2, src_dptr, src_width, src_height, src_pitch, (float*)s->sum, (float*)s->sum2, (float*)s->weight, (float*)s->weight2,
+ dst_dptr, dst_width, dst_height, dst_pitch, pixel_size);
+}
+
+
+static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
+{
+ AVFilterContext *ctx = inlink->dst;
+ NLMeansCudaContext *s = ctx->priv;
+ CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+ AVFilterLink *outlink = ctx->outputs[0];
+ AVHWFramesContext *hw_frames_ctx = (AVHWFramesContext*)s->hw_frames_ctx->data;
+ CUcontext dummy;
+
+ AVFrame *output = NULL;
+ int err, patch, research, patch_uv, research_uv;
+
+ output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+ if (!output) {
+ err = AVERROR(ENOMEM);
+ goto fail;
+ }
+
+ err = av_frame_copy_props(output, frame);
+ if (err < 0)
+ goto fail;
+
+ if (!s->initialised) {
+ init(ctx);
+ }
+
+ patch = s->patch_size / 2;
+ research = s->research_size / 2;
+ patch_uv = s->patch_size_uv / 2;
+ research_uv = s->research_size_uv / 2;
+
+ err = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
+ if (err < 0)
+ return err;
+
+
+
+ switch (hw_frames_ctx->sw_format) {
+ case AV_PIX_FMT_NV12:
+ nlmeans_plane(ctx, 1, frame->data[0], inlink->w, inlink->h, frame->linesize[0],
+ output->data[0], output->width, output->height, output->linesize[0],
+ (int*)s->integral_img, patch, research, 1);
+
+ nlmeans_plane2(ctx, 2, frame->data[1], inlink->w / 2, inlink->h / 2, frame->linesize[1],
+ output->data[1], output->width / 2, output->height / 2, output->linesize[1],
+ (int*)s->integral_img, (int*)s->integral_img2, patch_uv, research_uv, 1);
+
+ break;
+ case AV_PIX_FMT_YUV420P:
+ nlmeans_plane(ctx, 1, frame->data[0], inlink->w, inlink->h, frame->linesize[0],
+ output->data[0], output->width, output->height, output->linesize[0],
+ (int*)s->integral_img, patch, research, 1);
+
+ nlmeans_plane(ctx, 1, frame->data[1], inlink->w / 2, inlink->h / 2, frame->linesize[1],
+ output->data[1], output->width / 2, output->height / 2, output->linesize[1],
+ (int*)s->integral_img, patch_uv, research_uv, 1);
+
+ nlmeans_plane(ctx, 1, frame->data[2], inlink->w / 2, inlink->h / 2, frame->linesize[2],
+ output->data[2], output->width / 2, output->height / 2, output->linesize[2],
+ (int*)s->integral_img, patch_uv, research_uv, 1);
+
+ break;
+ case AV_PIX_FMT_YUV444P:
+ nlmeans_plane(ctx, 1, frame->data[0], inlink->w, inlink->h, frame->linesize[0],
+ output->data[0], output->width, output->height, output->linesize[0],
+ (int*)s->integral_img, patch, research, 1);
+
+ nlmeans_plane(ctx, 1, frame->data[1], inlink->w, inlink->h, frame->linesize[1],
+ output->data[1], output->width, output->height, output->linesize[1],
+ (int*)s->integral_img, patch_uv, research_uv, 1);
+
+ nlmeans_plane(ctx, 1, frame->data[2], inlink->w, inlink->h, frame->linesize[2],
+ output->data[2], output->width, output->height, output->linesize[2],
+ (int*)s->integral_img, patch_uv, research_uv, 1);
+
+ break;
+ default:
+ return AVERROR_BUG;
+ }
+
+ CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+ if (err < 0)
+ return err;
+
+ av_frame_free(&frame);
+
+ return ff_filter_frame(outlink, output);
+
+fail:
+ av_frame_free(&frame);
+ av_frame_free(&output);
+ return err;
+}
+
+static av_cold void uninit(AVFilterContext *ctx)
+{
+ NLMeansCudaContext *s = ctx->priv;
+
+ if (s->hwctx && s->cu_module) {
+ CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+ if (s->integral_img) {
+ CHECK_CU(cu->cuMemFree(s->integral_img));
+ s->integral_img = 0;
+ }
+
+ if (s->integral_img2) {
+ CHECK_CU(cu->cuMemFree(s->integral_img2));
+ s->integral_img2 = 0;
+ }
+
+ if (s->weight) {
+ CHECK_CU(cu->cuMemFree(s->weight));
+ s->weight = 0;
+ }
+
+ if (s->weight2) {
+ CHECK_CU(cu->cuMemFree(s->weight2));
+ s->weight2 = 0;
+ }
+
+ if (s->sum) {
+ CHECK_CU(cu->cuMemFree(s->sum));
+ s->sum = 0;
+ }
+
+ if (s->sum2) {
+ CHECK_CU(cu->cuMemFree(s->sum2));
+ s->sum2 = 0;
+ }
+
+ if (s->cu_module) {
+ CHECK_CU(cu->cuModuleUnload(s->cu_module));
+ s->cu_module = NULL;
+ }
+ }
+}
+
+
+static int config_props(AVFilterLink *inlink)
+{
+ AVFilterContext *ctx = inlink->dst;
+ NLMeansCudaContext *s = ctx->priv;
+ AVHWFramesContext *hw_frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
+ AVCUDADeviceContext *device_hwctx = hw_frames_ctx->device_ctx->hwctx;
+ CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
+ CudaFunctions *cu = device_hwctx->internal->cuda_dl;
+ int ret;
+
+ extern const unsigned char ff_vf_nlmeans_cuda_ptx_data[];
+ extern const unsigned int ff_vf_nlmeans_cuda_ptx_len;
+
+ int width = inlink->w;
+ int height = inlink->h;
+
+ s->hwctx = device_hwctx;
+ s->cu_stream = s->hwctx->stream;
+
+ ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
+ if (ret < 0)
+ return ret;
+
+ ret = ff_cuda_load_module(ctx, device_hwctx, &s->cu_module, ff_vf_nlmeans_cuda_ptx_data, ff_vf_nlmeans_cuda_ptx_len);
+ if (ret < 0)
+ return ret;
+
+ ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_horiz_uchar, s->cu_module, "nlmeans_horiz_uchar"));
+ if (ret < 0)
+ return ret;
+
+ ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_horiz_uchar2, s->cu_module, "nlmeans_horiz_uchar2"));
+ if (ret < 0)
+ return ret;
+
+ ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_vert_uchar, s->cu_module, "nlmeans_vert_uchar"));
+ if (ret < 0)
+ return ret;
+
+ ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_vert_uchar2, s->cu_module, "nlmeans_vert_uchar2"));
+ if (ret < 0)
+ return ret;
+
+ ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_weight_uchar, s->cu_module, "nlmeans_weight_uchar"));
+ if (ret < 0)
+ return ret;
+
+ ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_weight_uchar2, s->cu_module, "nlmeans_weight_uchar2"));
+ if (ret < 0)
+ return ret;
+
+ ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_average_uchar, s->cu_module, "nlmeans_average_uchar"));
+ if (ret < 0)
+ return ret;
+
+ ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_average_uchar2, s->cu_module, "nlmeans_average_uchar2"));
+ if (ret < 0)
+ return ret;
+
+ ret = CHECK_CU(cu->cuMemAlloc(&s->integral_img, 4 * width * height * sizeof(int)));
+ if (ret < 0)
+ return ret;
+
+ ret = CHECK_CU(cu->cuMemAlloc(&s->integral_img2, 4 * width * height * sizeof(int)));
+ if (ret < 0)
+ return ret;
+
+ ret = CHECK_CU(cu->cuMemAlloc(&s->weight, width * height * sizeof(float)));
+ if (ret < 0)
+ return ret;
+
+ ret = CHECK_CU(cu->cuMemAlloc(&s->sum, width * height * sizeof(float)));
+ if (ret < 0)
+ return ret;
+
+ ret = CHECK_CU(cu->cuMemAlloc(&s->weight2, width * height * sizeof(float)));
+ if (ret < 0)
+ return ret;
+
+ ret = CHECK_CU(cu->cuMemAlloc(&s->sum2, width * height * sizeof(float)));
+ if (ret < 0)
+ return ret;
+
+ CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+
+ s->hw_frames_ctx = ctx->inputs[0]->hw_frames_ctx;
+
+ ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->hw_frames_ctx);
+ if (!ctx->outputs[0]->hw_frames_ctx)
+ return AVERROR(ENOMEM);
+
+
+ if (!format_is_supported(hw_frames_ctx->sw_format)) {
+ av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n", av_get_pix_fmt_name(hw_frames_ctx->sw_format));
+ return AVERROR(ENOSYS);
+ }
+
+ return 0;
+}
+
+static int query_formats(AVFilterContext *ctx)
+{
+ static const enum AVPixelFormat pix_fmts[] = {
+ AV_PIX_FMT_CUDA,
+ AV_PIX_FMT_NONE
+ };
+ AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
+ if (!fmts_list)
+ return AVERROR(ENOMEM);
+ return ff_set_common_formats(ctx, fmts_list);
+}
+
+AVFILTER_DEFINE_CLASS(nlmeans_cuda);
+
+static const AVFilterPad nlmeans_cuda_inputs[] = {
+ {
+ .name = "default",
+ .type = AVMEDIA_TYPE_VIDEO,
+ .config_props = config_props,
+ .filter_frame = filter_frame,
+ }
+};
+
+static const AVFilterPad nlmeans_cuda_outputs[] = {
+ {
+ .name = "default",
+ .type = AVMEDIA_TYPE_VIDEO,
+ }
+};
+
+const AVFilter ff_vf_nlmeans_cuda = {
+ .name = "nlmeans_cuda",
+ .description = NULL_IF_CONFIG_SMALL("Non-local means denoiser through CUDA"),
+ .priv_size = sizeof(NLMeansCudaContext),
+ .init = init,
+ .uninit = uninit,
+ .query_formats = query_formats,
+ FILTER_INPUTS(nlmeans_cuda_inputs),
+ FILTER_OUTPUTS(nlmeans_cuda_outputs),
+ .priv_class = &nlmeans_cuda_class,
+ .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};
new file mode 100644
@@ -0,0 +1,378 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "cuda/vector_helpers.cuh"
+
+
+extern "C" {
+
+__global__ void nlmeans_average_uchar(
+ cudaTextureObject_t tex,
+ uchar *dst,
+ int width, int dst_pitch,
+ float* sum, float* weight)
+{
+ int x = blockIdx.x * blockDim.x + threadIdx.x;
+ int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+ float w = weight[y * width + x];
+ float s = sum[y * width + x];
+
+ uchar src_pix_uchar = tex2D<uchar>(tex, x, y);
+ int src_pix_int = src_pix_uchar;
+ float src_pix = (float)src_pix_int;
+
+ float resultf = (s + src_pix) / (1 + w);
+
+ int result = (int)resultf;
+
+ dst[y*dst_pitch+x] = result;
+}
+
+
+__global__ void nlmeans_average_uchar2(
+ cudaTextureObject_t tex,
+ uchar2 *dst,
+ int width, int dst_pitch,
+ float* sum, float* sum2, float* weight, float* weight2)
+{
+ int x = blockIdx.x * blockDim.x + threadIdx.x;
+ int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+ float w = weight[y * width + x];
+ float s = sum[y * width + x];
+
+ float w2 = weight2[y * width + x];
+ float s2 = sum2[y * width + x];
+
+ uchar2 src_pix_uchar = tex2D<uchar2>(tex, x, y);
+
+ int src_pix_int = src_pix_uchar.x;
+ float src_pix = (float)src_pix_int;
+
+ int src_pix_int2 = src_pix_uchar.y;
+ float src_pix2 = (float)src_pix_int2;
+
+ float resultf = (s + src_pix) / (1 + w);
+ float result2f = (s2 + src_pix2) / (1 + w2);
+
+ int result = (int)resultf;
+ int result2 = (int)result2f;
+
+ uchar2 output;
+ output.x = result;
+ output.y = result2;
+
+ dst[y*dst_pitch+x] = output;
+}
+
+
+__global__ void nlmeans_weight_uchar2(
+ float* sum, float* sum2,
+ float* weight, float* weight2,
+ int4 *integral_img,
+ int4 *integral_img2,
+ cudaTextureObject_t tex,
+ int width, int height,
+ int p, float h,
+ int4 dx, int4 dy)
+{
+ int x = blockIdx.x * blockDim.x + threadIdx.x;
+ int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+ int4 xoff = make_int4(x, x, x, x) + dx;
+ int4 yoff = make_int4(y, y, y, y) + dy;
+
+ int4 a = make_int4(0, 0, 0, 0);
+ int4 b = make_int4(0, 0, 0, 0);
+ int4 c = make_int4(0, 0, 0, 0);
+ int4 d = make_int4(0, 0, 0, 0);
+
+ int4 a2 = make_int4(0, 0, 0, 0);
+ int4 b2 = make_int4(0, 0, 0, 0);
+ int4 c2 = make_int4(0, 0, 0, 0);
+ int4 d2 = make_int4(0, 0, 0, 0);
+
+ int4 src_pix = make_int4(0, 0, 0, 0);
+ int4 src_pix2 = make_int4(0, 0, 0, 0);
+
+ int oobb = (x - p) < 0 || (y - p) < 0 || (y + p) >= height || (x + p) >= width;
+
+ uchar2 ax, ay, az, aw;
+ ax = tex2D<uchar2>(tex, xoff.x, yoff.x);
+ ay = tex2D<uchar2>(tex, xoff.y, yoff.y);
+ az = tex2D<uchar2>(tex, xoff.z, yoff.z);
+ aw = tex2D<uchar2>(tex, xoff.w, yoff.w);
+
+ src_pix.x = ax.x;
+ src_pix.y = ay.x;
+ src_pix.z = az.x;
+ src_pix.w = aw.x;
+
+ src_pix2.x = ax.y;
+ src_pix2.y = ay.y;
+ src_pix2.z = az.y;
+ src_pix2.w = aw.y;
+
+ if (!oobb) {
+ a = integral_img[(y - p) * width + x - p];
+ b = integral_img[(y + p) * width + x - p];
+ c = integral_img[(y - p) * width + x + p];
+ d = integral_img[(y + p) * width + x + p];
+
+ a2 = integral_img2[(y - p) * width + x - p];
+ b2 = integral_img2[(y + p) * width + x - p];
+ c2 = integral_img2[(y - p) * width + x + p];
+ d2 = integral_img2[(y + p) * width + x + p];
+ }
+
+ int4 patch_diff_int = d + a - c - b;
+ float4 patch_diff;
+ patch_diff.x = (float) (-patch_diff_int.x);
+ patch_diff.y = (float) (-patch_diff_int.y);
+ patch_diff.z = (float) (-patch_diff_int.z);
+ patch_diff.w = (float) (-patch_diff_int.w);
+
+ int4 patch_diff_int2 = d2 + a2 - c2 - b2;
+ float4 patch_diff2;
+ patch_diff2.x = (float) (-patch_diff_int2.x);
+ patch_diff2.y = (float) (-patch_diff_int2.y);
+ patch_diff2.z = (float) (-patch_diff_int2.z);
+ patch_diff2.w = (float) (-patch_diff_int2.w);
+
+
+ float4 w;
+ w.x = __expf(patch_diff.x / (h*h));
+ w.y = __expf(patch_diff.y / (h*h));
+ w.z = __expf(patch_diff.z / (h*h));
+ w.w = __expf(patch_diff.w / (h*h));
+
+ float4 w2;
+ w2.x = __expf(patch_diff2.x / (h*h));
+ w2.y = __expf(patch_diff2.y / (h*h));
+ w2.z = __expf(patch_diff2.z / (h*h));
+ w2.w = __expf(patch_diff2.w / (h*h));
+
+ float w_sum = w.x + w.y + w.z + w.w;
+ weight[y * width + x] += w_sum;
+
+ float w_sum2 = w2.x + w2.y + w2.z + w2.w;
+ weight2[y * width + x] += w_sum2;
+
+ float4 src_pixf;
+ src_pixf.x = (float)src_pix.x;
+ src_pixf.y = (float)src_pix.y;
+ src_pixf.z = (float)src_pix.z;
+ src_pixf.w = (float)src_pix.w;
+
+ float4 src_pix2f;
+ src_pix2f.x = (float)src_pix2.x;
+ src_pix2f.y = (float)src_pix2.y;
+ src_pix2f.z = (float)src_pix2.z;
+ src_pix2f.w = (float)src_pix2.w;
+
+ sum[y * width + x] += w.x * src_pixf.x + w.y * src_pixf.y + w.z * src_pixf.z + w.w * src_pixf.w;
+ sum2[y * width + x] += w2.x * src_pix2f.x + w2.y * src_pix2f.y + w2.z * src_pix2f.z + w2.w * src_pix2f.w;
+}
+
+
+__global__ void nlmeans_weight_uchar(
+ float* sum, float* weight,
+ int4 *integral_img,
+ cudaTextureObject_t tex,
+ int width, int height,
+ int p, float h,
+ int4 dx, int4 dy)
+{
+ int x = blockIdx.x * blockDim.x + threadIdx.x;
+ int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+ int4 xoff = make_int4(x, x, x, x) + dx;
+ int4 yoff = make_int4(y, y, y, y) + dy;
+
+ int4 a = make_int4(0, 0, 0, 0);
+ int4 b = make_int4(0, 0, 0, 0);
+ int4 c = make_int4(0, 0, 0, 0);
+ int4 d = make_int4(0, 0, 0, 0);
+
+ int4 src_pix = make_int4(0, 0, 0, 0);
+
+ int oobb = (x - p) < 0 || (y - p) < 0 || (y + p) >= height || (x + p) >= width;
+
+ src_pix.x = tex2D<uchar>(tex, xoff.x, yoff.x);
+ src_pix.y = tex2D<uchar>(tex, xoff.y, yoff.y);
+ src_pix.z = tex2D<uchar>(tex, xoff.z, yoff.z);
+ src_pix.w = tex2D<uchar>(tex, xoff.w, yoff.w);
+
+ if (!oobb) {
+ a = integral_img[(y - p) * width + x - p];
+ b = integral_img[(y + p) * width + x - p];
+ c = integral_img[(y - p) * width + x + p];
+ d = integral_img[(y + p) * width + x + p];
+ }
+
+ int4 patch_diff_int = d + a - c - b;
+ float4 patch_diff;
+ patch_diff.x = (float) (-patch_diff_int.x);
+ patch_diff.y = (float) (-patch_diff_int.y);
+ patch_diff.z = (float) (-patch_diff_int.z);
+ patch_diff.w = (float) (-patch_diff_int.w);
+
+
+ float4 w;
+ w.x = __expf(patch_diff.x / (h*h));
+ w.y = __expf(patch_diff.y / (h*h));
+ w.z = __expf(patch_diff.z / (h*h));
+ w.w = __expf(patch_diff.w / (h*h));
+
+ float w_sum = w.x + w.y + w.z + w.w;
+ weight[y * width + x] += w_sum;
+
+ float4 src_pixf;
+ src_pixf.x = (float)src_pix.x;
+ src_pixf.y = (float)src_pix.y;
+ src_pixf.z = (float)src_pix.z;
+ src_pixf.w = (float)src_pix.w;
+
+ sum[y * width + x] += w.x * src_pixf.x + w.y * src_pixf.y + w.z * src_pixf.z + w.w * src_pixf.w;
+}
+
+
+__global__ void nlmeans_vert_uchar(
+ int4 *integral_img,
+ int width, int height)
+{
+ int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+ int4 sum = make_int4(0, 0, 0, 0);
+
+ for (int i = 0; i < height; i++) {
+ integral_img[i * width + x] += sum;
+ sum = integral_img[i * width + x];
+ }
+}
+
+
+__global__ void nlmeans_vert_uchar2(
+ int4 *integral_img,
+ int4 *integral_img2,
+ int width, int height)
+{
+ int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+ int4 sum = make_int4(0, 0, 0, 0);
+ int4 sum2 = make_int4(0, 0, 0, 0);
+
+ for (int i = 0; i < height; i++) {
+ integral_img[i * width + x] += sum;
+ sum = integral_img[i * width + x];
+
+ integral_img2[i * width + x] += sum2;
+ sum2 = integral_img2[i * width + x];
+ }
+}
+
+
+__global__ void nlmeans_horiz_uchar(
+ int4 *integral_img,
+ cudaTextureObject_t tex,
+ int width, int height,
+ int4 dx, int4 dy)
+{
+ int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+ int4 sum = make_int4(0, 0, 0, 0);
+ uchar s2x, s2y, s2z, s2w;
+
+ for (int i = 0; i < width; i++) {
+ uchar s1c = tex2D<uchar>(tex, i, y);
+ int4 s1 = make_int4(s1c, s1c, s1c, s1c);
+
+ s2x = tex2D<uchar>(tex, i + dx.x, y + dy.x);
+ s2y = tex2D<uchar>(tex, i + dx.y, y + dy.y);
+ s2z = tex2D<uchar>(tex, i + dx.z, y + dy.z);
+ s2w = tex2D<uchar>(tex, i + dx.w, y + dy.w);
+
+ int4 s2 = make_int4(s2x, s2y, s2z, s2w);
+
+ //sum += (s1 - s2) * (s1 - s2);
+ int4 s3 = s1 - s2;
+ int4 s4;
+ s4.x = s3.x * s3.x;
+ s4.y = s3.y * s3.y;
+ s4.z = s3.z * s3.z;
+ s4.w = s3.w * s3.w;
+
+ sum += s4;
+
+ integral_img[y * width + i] = sum;
+ }
+}
+
+
+__global__ void nlmeans_horiz_uchar2(
+ int4 *integral_img,
+ int4 *integral_img2,
+ cudaTextureObject_t tex,
+ int width, int height,
+ int4 dx, int4 dy)
+{
+ int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+ int4 sum = make_int4(0, 0, 0, 0);
+ int4 sum2 = make_int4(0, 0, 0, 0);
+ uchar2 s2x, s2y, s2z, s2w;
+
+ for (int i = 0; i < width; i++) {
+ uchar2 s1c = tex2D<uchar2>(tex, i, y);
+ int4 s1 = make_int4(s1c.x, s1c.x, s1c.x, s1c.x);
+ int4 r1 = make_int4(s1c.y, s1c.y, s1c.y, s1c.y);
+
+ s2x = tex2D<uchar2>(tex, i + dx.x, y + dy.x);
+ s2y = tex2D<uchar2>(tex, i + dx.y, y + dy.y);
+ s2z = tex2D<uchar2>(tex, i + dx.z, y + dy.z);
+ s2w = tex2D<uchar2>(tex, i + dx.w, y + dy.w);
+
+ int4 s2 = make_int4(s2x.x, s2y.x, s2z.x, s2w.x);
+ int4 r2 = make_int4(s2x.y, s2y.y, s2z.y, s2w.y);
+
+ //sum += (s1 - s2) * (s1 - s2);
+ int4 s3 = s1 - s2;
+ int4 s4;
+ s4.x = s3.x * s3.x;
+ s4.y = s3.y * s3.y;
+ s4.z = s3.z * s3.z;
+ s4.w = s3.w * s3.w;
+
+ sum += s4;
+
+ //sum2 += (r1 - r2) * (r1 - r2);
+ int4 r3 = r1 - r2;
+ int4 r4;
+ r4.x = r3.x * r3.x;
+ r4.y = r3.y * r3.y;
+ r4.z = r3.z * r3.z;
+ r4.w = r3.w * r3.w;
+
+ sum2 += r4;
+
+ integral_img[y * width + i] = sum;
+ integral_img2[y * width + i] = sum2;
+ }
+}
+
+}
Subject: [PATCH] lavfi: add nlmeans_cuda filter Signed-off-by: Dylan Fernando <dylanf123@gmail.com> --- configure | 2 + doc/filters.texi | 4 + libavfilter/Makefile | 2 + libavfilter/allfilters.c | 1 + libavfilter/version.h | 4 +- libavfilter/vf_nlmeans_cuda.c | 883 +++++++++++++++++++++++++++++++++ libavfilter/vf_nlmeans_cuda.cu | 378 ++++++++++++++ 7 files changed, 1272 insertions(+), 2 deletions(-) create mode 100644 libavfilter/vf_nlmeans_cuda.c create mode 100644 libavfilter/vf_nlmeans_cuda.cu