[FFmpeg-devel,v1] lavfi: add nlmeans CUDA filter

Message ID	CAPa-kAziQwCDLWABje-j9NTE1=OgzTdzPH2c1yKuCr6VXpJ5Kg@mail.gmail.com
State	New
Headers	show Delivered-To: ffmpegpatchwork2@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; MIME-Version: 1.0 From: Dylan Fernando <dylanf123@gmail.com> Date: Tue, 27 Jul 2021 19:42:08 +0000 Message-ID: <CAPa-kAziQwCDLWABje-j9NTE1=OgzTdzPH2c1yKuCr6VXpJ5Kg@mail.gmail.com> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Content-Type: multipart/mixed; boundary="00000000000011c4a105c817b14f" Subject: [FFmpeg-devel] [PATCH v1] lavfi: add nlmeans CUDA filter Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>
Series	[FFmpeg-devel,v1] lavfi: add nlmeans CUDA filter \| expand [FFmpeg-devel,v1] lavfi: add nlmeans CUDA filter

Context	Check	Description
andriy/x86_make	success	Make finished
andriy/x86_make_fate	fail	Make fate failed
andriy/PPC64_make	success	Make finished
andriy/PPC64_make_fate	warning	Make fate failed

diff --git a/compat/cuda/cuda_runtime.h b/compat/cuda/cuda_runtime.h index c5450b2542..c1e2143dde 100644 --- a/compat/cuda/cuda_runtime.h +++ b/compat/cuda/cuda_runtime.h @@ -184,5 +184,6 @@ static inline __device__ double fabs(double a) { return __builtin_fabs(a); } static inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); } static inline __device__ float __cosf(float a) { return __nvvm_cos_approx_f(a); } +static inline __device__ float exp(float a) { return __nvvm_ex2_approx_f(a); } #endif /* COMPAT_CUDA_CUDA_RUNTIME_H */ diff --git a/configure b/configure index 646d16e3c9..96a6fcde7d 100755 --- a/configure +++ b/configure @@ -3094,6 +3094,8 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm" transpose_npp_filter_deps="ffnvcodec libnpp" overlay_cuda_filter_deps="ffnvcodec" overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm" +nlmeans_cuda_filter_deps="ffnvcodec" +nlmeans_cuda_filter_deps_any="cuda_nvcc cuda_llvm" amf_deps_any="libdl LoadLibrary" nvenc_deps="ffnvcodec" diff --git a/doc/filters.texi b/doc/filters.texi index 66c0f87e47..a0b68fc49f 100644 --- a/doc/filters.texi +++ b/doc/filters.texi @@ -15228,6 +15228,10 @@ Same as @option{r} but for chroma planes. The default value is @var{0} and means automatic. @end table +@section nlmeans_cuda + +Non-local Means denoise filter through CUDA, this filter accepts same options as @ref{nlmeans}. + @section nnedi Deinterlace video using neural network edge directed interpolation. diff --git a/libavfilter/Makefile b/libavfilter/Makefile index 49c0c8342b..565923d85a 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -341,6 +341,8 @@ OBJS-$(CONFIG_MPDECIMATE_FILTER) += vf_mpdecimate.o OBJS-$(CONFIG_NEGATE_FILTER) += vf_lut.o OBJS-$(CONFIG_NLMEANS_FILTER) += vf_nlmeans.o OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER) += vf_nlmeans_opencl.o opencl.o opencl/nlmeans.o +OBJS-$(CONFIG_NLMEANS_CUDA_FILTER) += vf_nlmeans_cuda.o vf_nlmeans_cuda.ptx.o \ + cuda/load_helper.o OBJS-$(CONFIG_NNEDI_FILTER) += vf_nnedi.o OBJS-$(CONFIG_NOFORMAT_FILTER) += vf_format.o OBJS-$(CONFIG_NOISE_FILTER) += vf_noise.o diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index ae74f9c891..5fcdfecfbc 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -327,6 +327,7 @@ extern const AVFilter ff_vf_msad; extern const AVFilter ff_vf_negate; extern const AVFilter ff_vf_nlmeans; extern const AVFilter ff_vf_nlmeans_opencl; +extern const AVFilter ff_vf_nlmeans_cuda; extern const AVFilter ff_vf_nnedi; extern const AVFilter ff_vf_noformat; extern const AVFilter ff_vf_noise; diff --git a/libavfilter/vf_nlmeans_cuda.c b/libavfilter/vf_nlmeans_cuda.c new file mode 100644 index 0000000000..fd7e649556 --- /dev/null +++ b/libavfilter/vf_nlmeans_cuda.c @@ -0,0 +1,814 @@ + +#include "libavutil/hwcontext.h" +#include "libavutil/hwcontext_cuda_internal.h" +#include "libavutil/cuda_check.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" + +#include "avfilter.h" +#include "internal.h" + +#include "cuda/load_helper.h" + +static const enum AVPixelFormat supported_formats[] = { + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_NV12, + AV_PIX_FMT_YUV444P, + AV_PIX_FMT_P010, + AV_PIX_FMT_P016, + AV_PIX_FMT_YUV444P16, + AV_PIX_FMT_0RGB32, + AV_PIX_FMT_0BGR32, +}; + + +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) + +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) ) +#define BLOCKX 32 +#define BLOCKY 16 + + + +typedef struct NLMeansCudaContext { + const AVClass *class; + + double sigma; + int patch_size; + int patch_size_uv; + int research_size; + int research_size_uv; + int initialised; + + float h; + + AVBufferRef *hw_frames_ctx; + AVCUDADeviceContext *hwctx; + + CUmodule cu_module; + + CUfunction cu_func_horiz_uchar; + CUfunction cu_func_horiz_uchar2; + CUfunction cu_func_vert_uchar; + CUfunction cu_func_vert_uchar2; + CUfunction cu_func_weight_uchar; + CUfunction cu_func_weight_uchar2; + CUfunction cu_func_average_uchar; + CUfunction cu_func_average_uchar2; + CUstream cu_stream; + + + CUdeviceptr integral_img; + CUdeviceptr integral_img2; + CUdeviceptr weight; + CUdeviceptr weight2; + CUdeviceptr sum; + CUdeviceptr sum2; + +} NLMeansCudaContext; + +#define OFFSET(x) offsetof(NLMeansCudaContext, x) +#define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM + +static const AVOption nlmeans_cuda_options[] = { + { "s", "denoising strength", OFFSET(sigma), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 30.0, FLAGS }, + { "p", "patch size", OFFSET(patch_size), AV_OPT_TYPE_INT, { .i64 = 2*3+1 }, 0, 99, FLAGS }, + { "pc", "patch size for chroma planes", OFFSET(patch_size_uv), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 99, FLAGS }, + { "r", "research window", OFFSET(research_size), AV_OPT_TYPE_INT, { .i64 = 7*2+1 }, 0, 99, FLAGS }, + { "rc", "research window for chroma planes", OFFSET(research_size_uv), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 99, FLAGS }, + { NULL } +}; + + + +static av_cold int init(AVFilterContext *avctx) +{ + NLMeansCudaContext *ctx = avctx->priv; + + ctx->h = ctx->sigma * 10; + if (!(ctx->research_size & 1)) { + ctx->research_size |= 1; + av_log(avctx, AV_LOG_WARNING, + "research_size should be odd, set to %d", + ctx->research_size); + } + + if (!(ctx->patch_size & 1)) { + ctx->patch_size |= 1; + av_log(avctx, AV_LOG_WARNING, + "patch_size should be odd, set to %d", + ctx->patch_size); + } + + if (!ctx->research_size_uv) + ctx->research_size_uv = ctx->research_size; + if (!ctx->patch_size_uv) + ctx->patch_size_uv = ctx->patch_size; + + + ctx->initialised = 1; + + return 0; +} + + +static int format_is_supported(enum AVPixelFormat fmt) +{ + int i; + + for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) + if (supported_formats[i] == fmt) + return 1; + return 0; +} + + +static int call_average2(AVFilterContext *ctx, CUfunction func, int channels, + uint8_t *src_dptr, int src_width, int src_height, int src_pitch, + float *sum, float *sum2, float *weight, float *weight2, + uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch, + int pixel_size) +{ + NLMeansCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr; + CUtexObject tex = 0; + void *args_uchar[] = { &tex, &dst_devptr, &src_width, &dst_pitch, &sum, &sum2, &weight, &weight2}; + + int ret; + + CUDA_TEXTURE_DESC tex_desc = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_RESOURCE_DESC res_desc = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = pixel_size == 1 ? + CU_AD_FORMAT_UNSIGNED_INT8 : + CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = channels, + .res.pitch2D.width = src_width, + .res.pitch2D.height = src_height, + .res.pitch2D.pitchInBytes = src_pitch, + .res.pitch2D.devPtr = (CUdeviceptr)src_dptr, + }; + + dst_pitch /= channels * pixel_size; + + ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL)); + if (ret < 0) + goto exit; + + ret = CHECK_CU(cu->cuLaunchKernel(func, + DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL)); + +exit: + if (tex) + CHECK_CU(cu->cuTexObjectDestroy(tex)); + + return ret; +} + + +static int call_average(AVFilterContext *ctx, CUfunction func, int channels, + uint8_t *src_dptr, int src_width, int src_height, int src_pitch, float *sum, float *weight, + uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch, + int pixel_size) +{ + NLMeansCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr; + CUtexObject tex = 0; + void *args_uchar[] = { &tex, &dst_devptr, &src_width, &dst_pitch, &sum, &weight}; + + int ret; + + CUDA_TEXTURE_DESC tex_desc = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_RESOURCE_DESC res_desc = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = pixel_size == 1 ? + CU_AD_FORMAT_UNSIGNED_INT8 : + CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = channels, + .res.pitch2D.width = src_width, + .res.pitch2D.height = src_height, + .res.pitch2D.pitchInBytes = src_pitch, + .res.pitch2D.devPtr = (CUdeviceptr)src_dptr, + }; + + dst_pitch /= channels * pixel_size; + + ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL)); + if (ret < 0) + goto exit; + + ret = CHECK_CU(cu->cuLaunchKernel(func, + DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL)); + +exit: + if (tex) + CHECK_CU(cu->cuTexObjectDestroy(tex)); + + return ret; +} + +static int call_weight2(AVFilterContext *ctx, int channels, + uint8_t *src_dptr, int src_width, int src_height, int src_pitch, int *integ_img, int *integ_img2, + float *sum, float *sum2, float *weight, float *weight2, int p, + int *dx_cur, int *dy_cur, + int pixel_size) +{ + NLMeansCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + + CUtexObject tex = 0; + int ret; + + + void *args_uchar[] = { &sum, &sum2, &weight, &weight2, &integ_img, &integ_img2, &tex, &src_width, &src_height, &p, &s->h, dx_cur, dy_cur }; + + + CUDA_TEXTURE_DESC tex_desc = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_RESOURCE_DESC res_desc = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = pixel_size == 1 ? + CU_AD_FORMAT_UNSIGNED_INT8 : + CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = channels, + .res.pitch2D.width = src_width, + .res.pitch2D.height = src_height, + .res.pitch2D.pitchInBytes = src_pitch, + .res.pitch2D.devPtr = (CUdeviceptr)src_dptr, + }; + + src_pitch /= channels * pixel_size; + + ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL)); + if (ret < 0) + goto exit; + + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_weight_uchar2, + DIV_UP(src_width, BLOCKX), DIV_UP(src_height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL)); + +exit: + if (tex) + CHECK_CU(cu->cuTexObjectDestroy(tex)); + + return ret; +} + + + +static int call_weight(AVFilterContext *ctx, int channels, + uint8_t *src_dptr, int src_width, int src_height, int src_pitch, int *integ_img, float *sum, float *weight, int p, + int *dx_cur, int *dy_cur, + int pixel_size) +{ + NLMeansCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + + CUtexObject tex = 0; + int ret; + + + void *args_uchar[] = { &sum, &weight, &integ_img, &tex, &src_width, &src_height, &p, &s->h, dx_cur, dy_cur }; + + + CUDA_TEXTURE_DESC tex_desc = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_RESOURCE_DESC res_desc = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = pixel_size == 1 ? + CU_AD_FORMAT_UNSIGNED_INT8 : + CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = channels, + .res.pitch2D.width = src_width, + .res.pitch2D.height = src_height, + .res.pitch2D.pitchInBytes = src_pitch, + .res.pitch2D.devPtr = (CUdeviceptr)src_dptr, + }; + + src_pitch /= channels * pixel_size; + + ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL)); + if (ret < 0) + goto exit; + + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_weight_uchar, + DIV_UP(src_width, BLOCKX), DIV_UP(src_height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL)); + +exit: + if (tex) + CHECK_CU(cu->cuTexObjectDestroy(tex)); + + return ret; +} + + +static int call_vert2(AVFilterContext *ctx, int channels, + int src_width, int src_height, int *integ_img, int *integ_img2, + int pixel_size) +{ + NLMeansCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + int ret; + + void *args_uchar[] = { &integ_img, &integ_img2, &src_width, &src_height }; + + + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_vert_uchar2, + DIV_UP(src_width, BLOCKX), 1, 1, + BLOCKX, 1, 1, 0, s->cu_stream, args_uchar, NULL)); + + + return ret; +} + + +static int call_vert(AVFilterContext *ctx, int channels, + int src_width, int src_height, int *integ_img, + int pixel_size) +{ + NLMeansCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + int ret; + + void *args_uchar[] = { &integ_img, &src_width, &src_height }; + + + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_vert_uchar, + DIV_UP(src_width, BLOCKX), 1, 1, + BLOCKX, 1, 1, 0, s->cu_stream, args_uchar, NULL)); + + + + return ret; +} + + +static int call_horiz2(AVFilterContext *ctx, int channels, + uint8_t *src_dptr, int src_width, int src_height, int src_pitch, int *integ_img, int *integ_img2, + int *dx_cur, int *dy_cur, + int pixel_size) +{ + NLMeansCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + + CUtexObject tex = 0; + int ret; + + + void *args_uchar[] = { &integ_img, &integ_img2, &tex, &src_width, &src_height, dx_cur, dy_cur }; + + + CUDA_TEXTURE_DESC tex_desc = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_RESOURCE_DESC res_desc = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = pixel_size == 1 ? + CU_AD_FORMAT_UNSIGNED_INT8 : + CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = channels, + .res.pitch2D.width = src_width, + .res.pitch2D.height = src_height, + .res.pitch2D.pitchInBytes = src_pitch, + .res.pitch2D.devPtr = (CUdeviceptr)src_dptr, + }; + + src_pitch /= channels * pixel_size; + + ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL)); + if (ret < 0) + goto exit; + + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_horiz_uchar2, + 1, DIV_UP(src_height, BLOCKY), 1, + 1, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL)); + +exit: + if (tex) + CHECK_CU(cu->cuTexObjectDestroy(tex)); + + return ret; +} + + +static int call_horiz(AVFilterContext *ctx, int channels, + uint8_t *src_dptr, int src_width, int src_height, int src_pitch, int *integ_img, + int *dx_cur, int *dy_cur, + int pixel_size) +{ + NLMeansCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + + CUtexObject tex = 0; + int ret; + + + void *args_uchar[] = { &integ_img, &tex, &src_width, &src_height, dx_cur, dy_cur }; + + + CUDA_TEXTURE_DESC tex_desc = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_RESOURCE_DESC res_desc = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = pixel_size == 1 ? + CU_AD_FORMAT_UNSIGNED_INT8 : + CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = channels, + .res.pitch2D.width = src_width, + .res.pitch2D.height = src_height, + .res.pitch2D.pitchInBytes = src_pitch, + .res.pitch2D.devPtr = (CUdeviceptr)src_dptr, + }; + + src_pitch /= channels * pixel_size; + + ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL)); + if (ret < 0) + goto exit; + + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_horiz_uchar, + 1, DIV_UP(src_height, BLOCKY), 1, + 1, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL)); + +exit: + if (tex) + CHECK_CU(cu->cuTexObjectDestroy(tex)); + + return ret; +} + + +static void nlmeans_plane(AVFilterContext *ctx, int channels, + uint8_t *src_dptr, int src_width, int src_height, int src_pitch, + uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch, + int *integ_img, + int p, int r, + int pixel_size) +{ + NLMeansCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + + + int nb_pixel, *tmp = NULL, idx = 0; + int *dxdy = NULL; + int i, dx, dy = 0; + + nb_pixel = (2 * r + 1) * (2 * r + 1) - 1; + dxdy = av_malloc(nb_pixel * 2 * sizeof(int)); + tmp = av_malloc(nb_pixel * 2 * sizeof(int)); + + for (dx = -r; dx <= r; dx++) { + for (dy = -r; dy <= r; dy++) { + if (dx || dy) { + tmp[idx++] = dx; + tmp[idx++] = dy; + } + } + } + // repack dx/dy seperately, as we want to do four pairs of dx/dy in a batch + for (i = 0; i < nb_pixel / 4; i++) { + dxdy[i * 8] = tmp[i * 8]; // dx0 + dxdy[i * 8 + 1] = tmp[i * 8 + 2]; // dx1 + dxdy[i * 8 + 2] = tmp[i * 8 + 4]; // dx2 + dxdy[i * 8 + 3] = tmp[i * 8 + 6]; // dx3 + dxdy[i * 8 + 4] = tmp[i * 8 + 1]; // dy0 + dxdy[i * 8 + 5] = tmp[i * 8 + 3]; // dy1 + dxdy[i * 8 + 6] = tmp[i * 8 + 5]; // dy2 + dxdy[i * 8 + 7] = tmp[i * 8 + 7]; // dy3 + } + av_freep(&tmp); + + + // fill with 0s + CHECK_CU(cu->cuMemsetD8Async(s->weight, 0, src_width * src_height * sizeof(float), s->cu_stream)); + CHECK_CU(cu->cuMemsetD8Async(s->sum, 0, src_width * src_height * sizeof(float), s->cu_stream)); + + + for (i = 0; i < nb_pixel / 4; i++) { + + int *dx_cur = dxdy + 8 * i; + int *dy_cur = dxdy + 8 * i + 4; + + call_horiz(ctx, 1, src_dptr, src_width, src_height, src_pitch, + integ_img, dx_cur, dy_cur, 1); + + call_vert(ctx, 1, src_width, src_height, integ_img, 1); + + call_weight(ctx, 1, src_dptr, src_width, src_height, src_pitch, integ_img, (float*)s->sum, (float*)s->weight, p, dx_cur, dy_cur, 1); + } + + call_average(ctx, s->cu_func_average_uchar, 1, src_dptr, src_width, src_height, src_pitch, (float*)s->sum, (float*)s->weight, + dst_dptr, dst_width, dst_height, dst_pitch, 1); +} + +static void nlmeans_plane2(AVFilterContext *ctx, int channels, + uint8_t *src_dptr, int src_width, int src_height, int src_pitch, + uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch, + int *integ_img, int *integ_img2, + int p, int r, + int pixel_size) +{ + NLMeansCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + + int nb_pixel, *tmp = NULL, idx = 0; + int *dxdy = NULL; + int i, dx, dy = 0; + + nb_pixel = (2 * r + 1) * (2 * r + 1) - 1; + dxdy = av_malloc(nb_pixel * 2 * sizeof(int)); + tmp = av_malloc(nb_pixel * 2 * sizeof(int)); + + for (dx = -r; dx <= r; dx++) { + for (dy = -r; dy <= r; dy++) { + if (dx || dy) { + tmp[idx++] = dx; + tmp[idx++] = dy; + } + } + } + // repack dx/dy seperately, as we want to do four pairs of dx/dy in a batch + for (i = 0; i < nb_pixel / 4; i++) { + dxdy[i * 8] = tmp[i * 8]; // dx0 + dxdy[i * 8 + 1] = tmp[i * 8 + 2]; // dx1 + dxdy[i * 8 + 2] = tmp[i * 8 + 4]; // dx2 + dxdy[i * 8 + 3] = tmp[i * 8 + 6]; // dx3 + dxdy[i * 8 + 4] = tmp[i * 8 + 1]; // dy0 + dxdy[i * 8 + 5] = tmp[i * 8 + 3]; // dy1 + dxdy[i * 8 + 6] = tmp[i * 8 + 5]; // dy2 + dxdy[i * 8 + 7] = tmp[i * 8 + 7]; // dy3 + } + av_freep(&tmp); + + + // fill with 0s + CHECK_CU(cu->cuMemsetD8Async(s->weight, 0, src_width * src_height * sizeof(float), s->cu_stream)); + CHECK_CU(cu->cuMemsetD8Async(s->sum, 0, src_width * src_height * sizeof(float), s->cu_stream)); + CHECK_CU(cu->cuMemsetD8Async(s->weight2, 0, src_width * src_height * sizeof(float), s->cu_stream)); + CHECK_CU(cu->cuMemsetD8Async(s->sum2, 0, src_width * src_height * sizeof(float), s->cu_stream)); + + for (i = 0; i < nb_pixel / 4; i++) { + + int *dx_cur = dxdy + 8 * i; + int *dy_cur = dxdy + 8 * i + 4; + + call_horiz2(ctx, 2, src_dptr, src_width, src_height, src_pitch, + integ_img, integ_img2, dx_cur, dy_cur, 1); + + call_vert2(ctx, 2, src_width, src_height, integ_img, integ_img2, 1); + + call_weight2(ctx, 2, src_dptr, src_width, src_height, src_pitch, integ_img, integ_img2, (float*)s->sum, (float*)s->sum2, (float*)s->weight, (float*)s->weight2, p, dx_cur, dy_cur, 1); + } + + call_average2(ctx, s->cu_func_average_uchar2, 2, src_dptr, src_width, src_height, src_pitch, (float*)s->sum, (float*)s->sum2, (float*)s->weight, (float*)s->weight2, + dst_dptr, dst_width, dst_height, dst_pitch, 1); +} + + +static int filter_frame(AVFilterLink *inlink, AVFrame *frame) +{ + AVFilterContext *ctx = inlink->dst; + NLMeansCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + AVFilterLink *outlink = ctx->outputs[0]; + AVHWFramesContext *hw_frames_ctx = (AVHWFramesContext*)s->hw_frames_ctx->data; + CUcontext dummy; + + AVFrame *output = NULL; + int err, patch, research, patch_uv, research_uv; + + output = ff_get_video_buffer(outlink, outlink->w, outlink->h); + if (!output) { + err = AVERROR(ENOMEM); + goto fail; + } + + err = av_frame_copy_props(output, frame); + if (err < 0) + goto fail; + + if (!s->initialised) { + init(ctx); + } + + patch = s->patch_size / 2; + research = s->research_size / 2; + patch_uv = s->patch_size_uv / 2; + research_uv = s->research_size_uv / 2; + + err = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + if (err < 0) + return err; + + + + switch (hw_frames_ctx->sw_format) { + case AV_PIX_FMT_NV12: + nlmeans_plane(ctx, 1, frame->data[0], inlink->w, inlink->h, frame->linesize[0], + output->data[0], output->width, output->height, output->linesize[0], + (int*)s->integral_img, patch, research, 1); + + nlmeans_plane2(ctx, 2, frame->data[1], inlink->w / 2, inlink->h / 2, frame->linesize[1], + output->data[1], output->width / 2, output->height / 2, output->linesize[1], + (int*)s->integral_img, (int*)s->integral_img2, patch_uv, research_uv, 1); + + break; + default: + return AVERROR_BUG; + } + + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + if (err < 0) + return err; + + av_frame_free(&frame); + + return ff_filter_frame(outlink, output); + +fail: + av_frame_free(&frame); + av_frame_free(&output); + return err; +} + +static av_cold void uninit(AVFilterContext *ctx) +{ + NLMeansCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + + + if (s->cu_module) { + CHECK_CU(cu->cuModuleUnload(s->cu_module)); + s->cu_module = NULL; + } +} + + +static int config_props(AVFilterLink *inlink) +{ + AVFilterContext *ctx = inlink->dst; + NLMeansCudaContext *s = ctx->priv; + AVHWFramesContext *hw_frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data; + AVCUDADeviceContext *device_hwctx = hw_frames_ctx->device_ctx->hwctx; + CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx; + CudaFunctions *cu = device_hwctx->internal->cuda_dl; + int ret; + + extern const unsigned char ff_vf_nlmeans_cuda_ptx_data[]; + extern const unsigned int ff_vf_nlmeans_cuda_ptx_len; + + int width = inlink->w; + int height = inlink->h; + + s->hwctx = device_hwctx; + s->cu_stream = s->hwctx->stream; + + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); + if (ret < 0) + return ret; + + ret = ff_cuda_load_module(ctx, device_hwctx, &s->cu_module, ff_vf_nlmeans_cuda_ptx_data, ff_vf_nlmeans_cuda_ptx_len); + if (ret < 0) + return ret; + + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_horiz_uchar, s->cu_module, "nlmeans_horiz_uchar")); + if (ret < 0) + return ret; + + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_horiz_uchar2, s->cu_module, "nlmeans_horiz_uchar2")); + if (ret < 0) + return ret; + + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_vert_uchar, s->cu_module, "nlmeans_vert_uchar")); + if (ret < 0) + return ret; + + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_vert_uchar2, s->cu_module, "nlmeans_vert_uchar2")); + if (ret < 0) + return ret; + + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_weight_uchar, s->cu_module, "nlmeans_weight_uchar")); + if (ret < 0) + return ret; + + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_weight_uchar2, s->cu_module, "nlmeans_weight_uchar2")); + if (ret < 0) + return ret; + + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_average_uchar, s->cu_module, "nlmeans_average_uchar")); + if (ret < 0) + return ret; + + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_average_uchar2, s->cu_module, "nlmeans_average_uchar2")); + if (ret < 0) + return ret; + + ret = CHECK_CU(cu->cuMemAlloc(&s->integral_img, 4 * width * height * sizeof(int))); + if (ret < 0) + return ret; + + ret = CHECK_CU(cu->cuMemAlloc(&s->integral_img2, 4 * width * height * sizeof(int))); + if (ret < 0) + return ret; + + ret = CHECK_CU(cu->cuMemAlloc(&s->weight, width * height * sizeof(float))); + if (ret < 0) + return ret; + + ret = CHECK_CU(cu->cuMemAlloc(&s->sum, width * height * sizeof(float))); + if (ret < 0) + return ret; + + ret = CHECK_CU(cu->cuMemAlloc(&s->weight2, width * height * sizeof(float))); + if (ret < 0) + return ret; + + ret = CHECK_CU(cu->cuMemAlloc(&s->sum2, width * height * sizeof(float))); + if (ret < 0) + return ret; + + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + + s->hw_frames_ctx = ctx->inputs[0]->hw_frames_ctx; + + ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->hw_frames_ctx); + if (!ctx->outputs[0]->hw_frames_ctx) + return AVERROR(ENOMEM); + + + if (!format_is_supported(hw_frames_ctx->sw_format)) { + av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n", av_get_pix_fmt_name(hw_frames_ctx->sw_format)); + return AVERROR(ENOSYS); + } + + return 0; +} + +static int query_formats(AVFilterContext *ctx) +{ + static const enum AVPixelFormat pix_fmts[] = { + AV_PIX_FMT_CUDA, + AV_PIX_FMT_NONE + }; + AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts); + if (!fmts_list) + return AVERROR(ENOMEM); + return ff_set_common_formats(ctx, fmts_list); +} + +AVFILTER_DEFINE_CLASS(nlmeans_cuda); + +static const AVFilterPad nlmeans_cuda_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = config_props, + .filter_frame = filter_frame, + }, + { NULL } +}; + +static const AVFilterPad nlmeans_cuda_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + }, + { NULL } +}; + +const AVFilter ff_vf_nlmeans_cuda = { + .name = "nlmeans_cuda", + .description = NULL_IF_CONFIG_SMALL("Non-local means denoiser through CUDA"), + .priv_size = sizeof(NLMeansCudaContext), + .init = init, + .uninit = uninit, + .query_formats = query_formats, + .inputs = nlmeans_cuda_inputs, + .outputs = nlmeans_cuda_outputs, + .priv_class = &nlmeans_cuda_class, + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, +}; diff --git a/libavfilter/vf_nlmeans_cuda.cu b/libavfilter/vf_nlmeans_cuda.cu new file mode 100644 index 0000000000..74cd70aa37 --- /dev/null +++ b/libavfilter/vf_nlmeans_cuda.cu @@ -0,0 +1,361 @@ +#include "cuda/vector_helpers.cuh" + + +extern "C" { + +__global__ void nlmeans_average_uchar( + cudaTextureObject_t tex, + uchar *dst, + int width, int dst_pitch, + float* sum, float* weight) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + float w = weight[y * width + x]; + float s = sum[y * width + x]; + + uchar src_pix_uchar = tex2D<uchar>(tex, x, y); + int src_pix_int = src_pix_uchar; + float src_pix = (float)src_pix_int; + + float resultf = (s + src_pix) / (1 + w); + + int result = (int)resultf; + + dst[y*dst_pitch+x] = result; +} + + +__global__ void nlmeans_average_uchar2( + cudaTextureObject_t tex, + uchar2 *dst, + int width, int dst_pitch, + float* sum, float* sum2, float* weight, float* weight2) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + float w = weight[y * width + x]; + float s = sum[y * width + x]; + + float w2 = weight2[y * width + x]; + float s2 = sum2[y * width + x]; + + uchar2 src_pix_uchar = tex2D<uchar2>(tex, x, y); + + int src_pix_int = src_pix_uchar.x; + float src_pix = (float)src_pix_int; + + int src_pix_int2 = src_pix_uchar.y; + float src_pix2 = (float)src_pix_int2; + + float resultf = (s + src_pix) / (1 + w); + float result2f = (s2 + src_pix2) / (1 + w2); + + int result = (int)resultf; + int result2 = (int)result2f; + + uchar2 output; + output.x = result; + output.y = result2; + + dst[y*dst_pitch+x] = output; +} + + +__global__ void nlmeans_weight_uchar2( + float* sum, float* sum2, + float* weight, float* weight2, + int4 *integral_img, + int4 *integral_img2, + cudaTextureObject_t tex, + int width, int height, + int p, float h, + int4 dx, int4 dy) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + int4 xoff = make_int4(x, x, x, x) + dx; + int4 yoff = make_int4(y, y, y, y) + dy; + + int4 a = make_int4(0, 0, 0, 0); + int4 b = make_int4(0, 0, 0, 0); + int4 c = make_int4(0, 0, 0, 0); + int4 d = make_int4(0, 0, 0, 0); + + int4 a2 = make_int4(0, 0, 0, 0); + int4 b2 = make_int4(0, 0, 0, 0); + int4 c2 = make_int4(0, 0, 0, 0); + int4 d2 = make_int4(0, 0, 0, 0); + + int4 src_pix = make_int4(0, 0, 0, 0); + int4 src_pix2 = make_int4(0, 0, 0, 0); + + int oobb = (x - p) < 0 || (y - p) < 0 || (y + p) >= height || (x + p) >= width; + + uchar2 ax, ay, az, aw; + ax = tex2D<uchar2>(tex, xoff.x, yoff.x); + ay = tex2D<uchar2>(tex, xoff.y, yoff.y); + az = tex2D<uchar2>(tex, xoff.z, yoff.z); + aw = tex2D<uchar2>(tex, xoff.w, yoff.w); + + src_pix.x = ax.x; + src_pix.y = ay.x; + src_pix.z = az.x; + src_pix.w = aw.x; + + src_pix2.x = ax.y; + src_pix2.y = ay.y; + src_pix2.z = az.y; + src_pix2.w = aw.y; + + if (!oobb) { + a = integral_img[(y - p) * width + x - p]; + b = integral_img[(y + p) * width + x - p]; + c = integral_img[(y - p) * width + x + p]; + d = integral_img[(y + p) * width + x + p]; + + a2 = integral_img2[(y - p) * width + x - p]; + b2 = integral_img2[(y + p) * width + x - p]; + c2 = integral_img2[(y - p) * width + x + p]; + d2 = integral_img2[(y + p) * width + x + p]; + } + + int4 patch_diff_int = d + a - c - b; + float4 patch_diff; + patch_diff.x = (float) (-patch_diff_int.x); + patch_diff.y = (float) (-patch_diff_int.y); + patch_diff.z = (float) (-patch_diff_int.z); + patch_diff.w = (float) (-patch_diff_int.w); + + int4 patch_diff_int2 = d2 + a2 - c2 - b2; + float4 patch_diff2; + patch_diff2.x = (float) (-patch_diff_int2.x); + patch_diff2.y = (float) (-patch_diff_int2.y); + patch_diff2.z = (float) (-patch_diff_int2.z); + patch_diff2.w = (float) (-patch_diff_int2.w); + + + float4 w; + w.x = exp(patch_diff.x / (h*h)); + w.y = exp(patch_diff.y / (h*h)); + w.z = exp(patch_diff.z / (h*h)); + w.w = exp(patch_diff.w / (h*h)); + + float4 w2; + w2.x = exp(patch_diff2.x / (h*h)); + w2.y = exp(patch_diff2.y / (h*h)); + w2.z = exp(patch_diff2.z / (h*h)); + w2.w = exp(patch_diff2.w / (h*h)); + + float w_sum = w.x + w.y + w.z + w.w; + weight[y * width + x] += w_sum; + + float w_sum2 = w2.x + w2.y + w2.z + w2.w; + weight2[y * width + x] += w_sum2; + + float4 src_pixf; + src_pixf.x = (float)src_pix.x; + src_pixf.y = (float)src_pix.y; + src_pixf.z = (float)src_pix.z; + src_pixf.w = (float)src_pix.w; + + float4 src_pix2f; + src_pix2f.x = (float)src_pix2.x; + src_pix2f.y = (float)src_pix2.y; + src_pix2f.z = (float)src_pix2.z; + src_pix2f.w = (float)src_pix2.w; + + sum[y * width + x] += w.x * src_pixf.x + w.y * src_pixf.y + w.z * src_pixf.z + w.w * src_pixf.w; + sum2[y * width + x] += w2.x * src_pix2f.x + w2.y * src_pix2f.y + w2.z * src_pix2f.z + w2.w * src_pix2f.w; +} + + +__global__ void nlmeans_weight_uchar( + float* sum, float* weight, + int4 *integral_img, + cudaTextureObject_t tex, + int width, int height, + int p, float h, + int4 dx, int4 dy) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + int4 xoff = make_int4(x, x, x, x) + dx; + int4 yoff = make_int4(y, y, y, y) + dy; + + int4 a = make_int4(0, 0, 0, 0); + int4 b = make_int4(0, 0, 0, 0); + int4 c = make_int4(0, 0, 0, 0); + int4 d = make_int4(0, 0, 0, 0); + + int4 src_pix = make_int4(0, 0, 0, 0); + + int oobb = (x - p) < 0 || (y - p) < 0 || (y + p) >= height || (x + p) >= width; + + src_pix.x = tex2D<uchar>(tex, xoff.x, yoff.x); + src_pix.y = tex2D<uchar>(tex, xoff.y, yoff.y); + src_pix.z = tex2D<uchar>(tex, xoff.z, yoff.z); + src_pix.w = tex2D<uchar>(tex, xoff.w, yoff.w); + + if (!oobb) { + a = integral_img[(y - p) * width + x - p]; + b = integral_img[(y + p) * width + x - p]; + c = integral_img[(y - p) * width + x + p]; + d = integral_img[(y + p) * width + x + p]; + } + + int4 patch_diff_int = d + a - c - b; + float4 patch_diff; + patch_diff.x = (float) (-patch_diff_int.x); + patch_diff.y = (float) (-patch_diff_int.y); + patch_diff.z = (float) (-patch_diff_int.z); + patch_diff.w = (float) (-patch_diff_int.w); + + + float4 w; + w.x = exp(patch_diff.x / (h*h)); + w.y = exp(patch_diff.y / (h*h)); + w.z = exp(patch_diff.z / (h*h)); + w.w = exp(patch_diff.w / (h*h)); + + float w_sum = w.x + w.y + w.z + w.w; + weight[y * width + x] += w_sum; + + float4 src_pixf; + src_pixf.x = (float)src_pix.x; + src_pixf.y = (float)src_pix.y; + src_pixf.z = (float)src_pix.z; + src_pixf.w = (float)src_pix.w; + + sum[y * width + x] += w.x * src_pixf.x + w.y * src_pixf.y + w.z * src_pixf.z + w.w * src_pixf.w; +} + + +__global__ void nlmeans_vert_uchar( + int4 *integral_img, + int width, int height) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + + int4 sum = make_int4(0, 0, 0, 0); + + for (int i = 0; i < height; i++) { + integral_img[i * width + x] += sum; + sum = integral_img[i * width + x]; + } +} + + +__global__ void nlmeans_vert_uchar2( + int4 *integral_img, + int4 *integral_img2, + int width, int height) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + + int4 sum = make_int4(0, 0, 0, 0); + int4 sum2 = make_int4(0, 0, 0, 0); + + for (int i = 0; i < height; i++) { + integral_img[i * width + x] += sum; + sum = integral_img[i * width + x]; + + integral_img2[i * width + x] += sum2; + sum2 = integral_img2[i * width + x]; + } +} + + +__global__ void nlmeans_horiz_uchar( + int4 *integral_img, + cudaTextureObject_t tex, + int width, int height, + int4 dx, int4 dy) +{ + int y = blockIdx.y * blockDim.y + threadIdx.y; + + int4 sum = make_int4(0, 0, 0, 0); + uchar s2x, s2y, s2z, s2w; + + for (int i = 0; i < width; i++) { + uchar s1c = tex2D<uchar>(tex, i, y); + int4 s1 = make_int4(s1c, s1c, s1c, s1c); + + s2x = tex2D<uchar>(tex, i + dx.x, y + dy.x); + s2y = tex2D<uchar>(tex, i + dx.y, y + dy.y); + s2z = tex2D<uchar>(tex, i + dx.z, y + dy.z); + s2w = tex2D<uchar>(tex, i + dx.w, y + dy.w); + + int4 s2 = make_int4(s2x, s2y, s2z, s2w); + + //sum += (s1 - s2) * (s1 - s2); + int4 s3 = s1 - s2; + int4 s4; + s4.x = s3.x * s3.x; + s4.y = s3.y * s3.y; + s4.z = s3.z * s3.z; + s4.w = s3.w * s3.w; + + sum += s4; + + integral_img[y * width + i] = sum; + } +} + + +__global__ void nlmeans_horiz_uchar2( + int4 *integral_img, + int4 *integral_img2, + cudaTextureObject_t tex, + int width, int height, + int4 dx, int4 dy) +{ + int y = blockIdx.y * blockDim.y + threadIdx.y; + + int4 sum = make_int4(0, 0, 0, 0); + int4 sum2 = make_int4(0, 0, 0, 0); + uchar2 s2x, s2y, s2z, s2w; + + for (int i = 0; i < width; i++) { + uchar2 s1c = tex2D<uchar2>(tex, i, y); + int4 s1 = make_int4(s1c.x, s1c.x, s1c.x, s1c.x); + int4 r1 = make_int4(s1c.y, s1c.y, s1c.y, s1c.y); + + s2x = tex2D<uchar2>(tex, i + dx.x, y + dy.x); + s2y = tex2D<uchar2>(tex, i + dx.y, y + dy.y); + s2z = tex2D<uchar2>(tex, i + dx.z, y + dy.z); + s2w = tex2D<uchar2>(tex, i + dx.w, y + dy.w); + + int4 s2 = make_int4(s2x.x, s2y.x, s2z.x, s2w.x); + int4 r2 = make_int4(s2x.y, s2y.y, s2z.y, s2w.y); + + //sum += (s1 - s2) * (s1 - s2); + int4 s3 = s1 - s2; + int4 s4; + s4.x = s3.x * s3.x; + s4.y = s3.y * s3.y; + s4.z = s3.z * s3.z; + s4.w = s3.w * s3.w; + + sum += s4; + + //sum2 += (r1 - r2) * (r1 - r2); + int4 r3 = r1 - r2; + int4 r4; + r4.x = r3.x * r3.x; + r4.y = r3.y * r3.y; + r4.z = r3.z * r3.z; + r4.w = r3.w * r3.w; + + sum2 += r4; + + integral_img[y * width + i] = sum; + integral_img2[y * width + i] = sum2; + } +} + +}

[FFmpeg-devel,v1] lavfi: add nlmeans CUDA filter

Checks

Commit Message

Comments

Patch