diff mbox

[FFmpeg-devel] lavfi: add nlmeans_opencl filter

Message ID 20190401075234.13338-1-ruiling.song@intel.com
State New
Headers show

Commit Message

Ruiling Song April 1, 2019, 7:52 a.m. UTC
Signed-off-by: Ruiling Song <ruiling.song@intel.com>
---
This filter runs about 2x faster on integrated GPU than nlmeans on my Skylake CPU.
Anybody like to give some comments?

Ruiling

 configure                       |   1 +
 doc/filters.texi                |   4 +
 libavfilter/Makefile            |   1 +
 libavfilter/allfilters.c        |   1 +
 libavfilter/opencl/nlmeans.cl   | 108 +++++++++
 libavfilter/opencl_source.h     |   1 +
 libavfilter/vf_nlmeans_opencl.c | 390 ++++++++++++++++++++++++++++++++
 7 files changed, 506 insertions(+)
 create mode 100644 libavfilter/opencl/nlmeans.cl
 create mode 100644 libavfilter/vf_nlmeans_opencl.c

Comments

Jun Zhao April 1, 2019, 12:01 p.m. UTC | #1
On Mon, Apr 1, 2019 at 3:53 PM Ruiling Song <ruiling.song@intel.com> wrote:

> Signed-off-by: Ruiling Song <ruiling.song@intel.com>
> ---
> This filter runs about 2x faster on integrated GPU than nlmeans on my
> Skylake CPU.
> Anybody like to give some comments?
>
> Ruiling
>
>  configure                       |   1 +
>  doc/filters.texi                |   4 +
>  libavfilter/Makefile            |   1 +
>  libavfilter/allfilters.c        |   1 +
>  libavfilter/opencl/nlmeans.cl   | 108 +++++++++
>  libavfilter/opencl_source.h     |   1 +
>  libavfilter/vf_nlmeans_opencl.c | 390 ++++++++++++++++++++++++++++++++
>  7 files changed, 506 insertions(+)
>  create mode 100644 libavfilter/opencl/nlmeans.cl
>  create mode 100644 libavfilter/vf_nlmeans_opencl.c
>
> diff --git a/configure b/configure
> index f6123f53e5..a233512491 100755
> --- a/configure
> +++ b/configure
> @@ -3460,6 +3460,7 @@ mpdecimate_filter_select="pixelutils"
>  minterpolate_filter_select="scene_sad"
>  mptestsrc_filter_deps="gpl"
>  negate_filter_deps="lut_filter"
> +nlmeans_opencl_filter_deps="opencl"
>  nnedi_filter_deps="gpl"
>  ocr_filter_deps="libtesseract"
>  ocv_filter_deps="libopencv"
> diff --git a/doc/filters.texi b/doc/filters.texi
> index 867607d870..21c2c1a4b5 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -19030,6 +19030,10 @@ Apply erosion filter with threshold0 set to 30,
> threshold1 set 40, threshold2 se
>  @end example
>  @end itemize
>
> +@section nlmeans_opencl
> +
> +Non-local Means denoise filter through OpenCL, this filter accepts same
> options as @ref{nlmeans}.
> +
>  @section overlay_opencl
>
>  Overlay one video on top of another.
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index fef6ec5c55..92039bfdcf 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -291,6 +291,7 @@ OBJS-$(CONFIG_MIX_FILTER)                    +=
> vf_mix.o
>  OBJS-$(CONFIG_MPDECIMATE_FILTER)             += vf_mpdecimate.o
>  OBJS-$(CONFIG_NEGATE_FILTER)                 += vf_lut.o
>  OBJS-$(CONFIG_NLMEANS_FILTER)                += vf_nlmeans.o
> +OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER)         += vf_nlmeans_opencl.o
> opencl.o opencl/nlmeans.o
>  OBJS-$(CONFIG_NNEDI_FILTER)                  += vf_nnedi.o
>  OBJS-$(CONFIG_NOFORMAT_FILTER)               += vf_format.o
>  OBJS-$(CONFIG_NOISE_FILTER)                  += vf_noise.o
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index c51ae0f3c7..2a6390c92d 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -277,6 +277,7 @@ extern AVFilter ff_vf_mix;
>  extern AVFilter ff_vf_mpdecimate;
>  extern AVFilter ff_vf_negate;
>  extern AVFilter ff_vf_nlmeans;
> +extern AVFilter ff_vf_nlmeans_opencl;
>  extern AVFilter ff_vf_nnedi;
>  extern AVFilter ff_vf_noformat;
>  extern AVFilter ff_vf_noise;
> diff --git a/libavfilter/opencl/nlmeans.cl b/libavfilter/opencl/nlmeans.cl
> new file mode 100644
> index 0000000000..dcb04834ca
> --- /dev/null
> +++ b/libavfilter/opencl/nlmeans.cl
> @@ -0,0 +1,108 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
> +                           CLK_ADDRESS_CLAMP_TO_EDGE   |
> +                           CLK_FILTER_NEAREST);
> +
> +kernel void horiz_sum(__global uint4 *ii,
> +                      __read_only image2d_t src,
> +                      int width,
> +                      int height,
> +                      int4 dx,
> +                      int4 dy)
> +{
> +
> +    int y = get_global_id(0);
> +    int work_size = get_global_size(0);
> +
> +    uint4 sum = (uint4)(0);
> +    float4 s2;
> +    for (int i = 0; i < width; i++) {
> +        float s1 = read_imagef(src, sampler, (int2)(i, y)).x;
> +        s2.x = read_imagef(src, sampler, (int2)(i+dx.x, y+dy.x)).x;
> +        s2.y = read_imagef(src, sampler, (int2)(i+dx.y, y+dy.y)).x;
> +        s2.z = read_imagef(src, sampler, (int2)(i+dx.z, y+dy.z)).x;
> +        s2.w = read_imagef(src, sampler, (int2)(i+dx.w, y+dy.w)).x;
> +        sum += convert_uint4((s1-s2)*(s1-s2) * 255*255);
> +        ii[y * width + i] = sum;
> +    }
> +}
> +
> +kernel void vert_sum(__global uint4 *ii,
> +                     int width,
> +                     int height)
> +{
> +    int x = get_global_id(0);
> +    uint4 sum = 0;
> +    for (int i = 0; i < height; i++) {
> +        ii[i * width + x] += sum;
> +        sum = ii[i * width + x];
> +    }
> +}
> +
> +kernel void weight_accum(global float *sum, global float *weight,
> +                         global uint4 *ii, __read_only image2d_t src,
> +                         int width, int height, int p, float h,
> +                         int4 dx, int4 dy)
> +{
> +    // w(x) = ii(x-p, y-p) + ii(x+p, y+p) - ii(x+p, y-p) - ii(x-p, y+p)
> +    // total_sum[x] += w(x, y) * src(x+dx, y+dy)
> +    // total_weight += w(x, y)
> +    int x = get_global_id(0);
> +    int y = get_global_id(1);
> +    int4 xoff = x+dx;
> +    int4 yoff = y+dy;
> +    uint4 a = 0, b = 0, c = 0, d = 0;
> +    uint4 src_pix = 0;
> +
> +    // out-of-bounding-box?
> +    int oobb = (x-p) < 0 || (y-p) < 0 || (y+p) >= height || (x+p) >=
> width;
> +
> +    src_pix.x = (int)(255 * read_imagef(src, sampler, (int2)(xoff.x,
> yoff.x)).x);
> +    src_pix.y = (int)(255 * read_imagef(src, sampler, (int2)(xoff.y,
> yoff.y)).x);
> +    src_pix.z = (int)(255 * read_imagef(src, sampler, (int2)(xoff.z,
> yoff.z)).x);
> +    src_pix.w = (int)(255 * read_imagef(src, sampler, (int2)(xoff.w,
> yoff.w)).x);
> +    if (!oobb) {
> +        a = ii[(y-p) * width + x - p];
> +        b = ii[(y + p) * width + x - p];
> +        c = ii[(y-p) * width + x + p];
> +        d = ii[(y + p) * width + x + p];
> +    }
> +
> +    float4 patch_diff = convert_float4(d + a - c - b);
> +    float4 w = native_exp(-patch_diff/(h*h));
> +    float w_sum = w.x + w.y + w.z + w.w;
> +    weight[y*width + x] += w_sum;
> +    sum[y*width + x] += dot(w, convert_float4(src_pix));
> +}
> +
> +kernel void average(__write_only image2d_t dst,
> +                    __read_only image2d_t src,
> +                    global float *sum, global float *weight) {
> +    int x = get_global_id(0);
> +    int y = get_global_id(1);
> +    int2 dim = get_image_dim(dst);
> +
> +    float w = weight[y * dim.x + x];
> +    float s = sum[y*dim.x + x];
> +    float src_pix = read_imagef(src, sampler, (int2)(x, y)).x;
> +    float r = (s + src_pix * 255) / (1.0f + w) / 255.0f;
> +    if (x < dim.x && y < dim.y)
> +        write_imagef(dst, (int2)(x, y), (float4)(r, 0.0f, 0.0f, 1.0f));
> +}
> diff --git a/libavfilter/opencl_source.h b/libavfilter/opencl_source.h
> index 4118138c30..fd40fd7dca 100644
> --- a/libavfilter/opencl_source.h
> +++ b/libavfilter/opencl_source.h
> @@ -23,6 +23,7 @@ extern const char *ff_opencl_source_avgblur;
>  extern const char *ff_opencl_source_colorspace_common;
>  extern const char *ff_opencl_source_convolution;
>  extern const char *ff_opencl_source_neighbor;
> +extern const char *ff_opencl_source_nlmeans;
>  extern const char *ff_opencl_source_overlay;
>  extern const char *ff_opencl_source_tonemap;
>  extern const char *ff_opencl_source_transpose;
> diff --git a/libavfilter/vf_nlmeans_opencl.c
> b/libavfilter/vf_nlmeans_opencl.c
> new file mode 100644
> index 0000000000..0cc9af10da
> --- /dev/null
> +++ b/libavfilter/vf_nlmeans_opencl.c
> @@ -0,0 +1,390 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +#include <float.h>
> +
> +#include "libavutil/avassert.h"
> +#include "libavutil/common.h"
> +#include "libavutil/imgutils.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/pixdesc.h"
> +
> +#include "avfilter.h"
> +#include "internal.h"
> +#include "opencl.h"
> +#include "opencl_source.h"
> +#include "video.h"
> +
> +static const enum AVPixelFormat supported_formats[] = {
> +    AV_PIX_FMT_YUV420P,
> +    AV_PIX_FMT_YUV444P,
> +    AV_PIX_FMT_GBRP,
> +};
> +
> +static int is_format_supported(enum AVPixelFormat fmt)
> +{
> +    int i;
> +
> +    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
> +        if (supported_formats[i] == fmt)
> +            return 1;
> +    return 0;
> +}
> +
> +typedef struct NLMeansOpenCLContext {
> +    OpenCLFilterContext   ocf;
> +    int                   initialised;
> +    cl_kernel             vert_kernel;
> +    cl_kernel             horiz_kernel;
> +    cl_kernel             accum_kernel;
> +    cl_kernel             average_kernel;
> +    double                sigma;
> +    float                 h;
> +    int                   chroma_w;
> +    int                   chroma_h;
> +    int                   patch_size;
> +    int                   patch_size_uv;
> +    int                   research_size;
> +    int                   research_size_uv;
> +    cl_command_queue      command_queue;
> +} NLMeansOpenCLContext;
> +
> +static int nlmeans_opencl_init(AVFilterContext *avctx)
> +{
> +    NLMeansOpenCLContext *ctx = avctx->priv;
> +    cl_int cle;
> +    int err;
> +
> +    ctx->h = ctx->sigma * 10;
> +    if (!ctx->research_size_uv)
> +        ctx->research_size_uv = ctx->research_size;
> +    if (!ctx->patch_size_uv)
> +        ctx->patch_size_uv = ctx->patch_size;
> +
> +    err = ff_opencl_filter_load_program(avctx, &ff_opencl_source_nlmeans,
> 1);
> +    if (err < 0)
> +        goto fail;
> +
> +    ctx->command_queue = clCreateCommandQueue(ctx->ocf.hwctx->context,
> +                                              ctx->ocf.hwctx->device_id,
> +                                              0, &cle);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create OpenCL "
> +                     "command queue %d.\n", cle);
> +
> +    ctx->vert_kernel = clCreateKernel(ctx->ocf.program, "vert_sum", &cle);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create vert_sum kernel
> %d.\n", cle);
> +
> +    ctx->horiz_kernel = clCreateKernel(ctx->ocf.program, "horiz_sum",
> &cle);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create horiz_sum kernel
> %d.\n", cle);
> +
> +    ctx->accum_kernel = clCreateKernel(ctx->ocf.program, "weight_accum",
> &cle);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create accum kernel %d.\n",
> cle);
> +
> +    ctx->average_kernel = clCreateKernel(ctx->ocf.program, "average",
> &cle);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create average kernel
> %d.\n", cle);
> +
> +    ctx->initialised = 1;
> +    return 0;
> +
> +fail:
> +    if (ctx->command_queue)
> +        clReleaseCommandQueue(ctx->command_queue);
> +    if (ctx->vert_kernel)
> +        clReleaseKernel(ctx->vert_kernel);
> +    if (ctx->horiz_kernel)
> +        clReleaseKernel(ctx->horiz_kernel);
> +    if (ctx->accum_kernel)
> +        clReleaseKernel(ctx->accum_kernel);
> +    if (ctx->average_kernel)
> +        clReleaseKernel(ctx->average_kernel);
> +    return err;
> +}
> +
> +static int nlmeans_plane(AVFilterContext *avctx, cl_mem dst, cl_mem src,
> +                         int w, int h, int p, int r)
> +{
> +    NLMeansOpenCLContext *ctx = avctx->priv;
> +    const float zero = 0.0f;
> +    const size_t worksize1[] = {h};
> +    const size_t worksize2[] = {w};
> +    const size_t worksize3[2] = {w, h};
> +    int dx, dy, err = 0, weight_buf_size;
> +    cl_mem ii, weight, sum;
> +    cl_int cle;
> +    int nb_pixel, *tmp, *dxdy, idx = 0;
> +
> +    weight_buf_size = w * h * sizeof(int);
> +    ii = clCreateBuffer(ctx->ocf.hwctx->context, 0, 4 * weight_buf_size,
> +                               NULL, &cle);
> +    weight = clCreateBuffer(ctx->ocf.hwctx->context, 0, weight_buf_size,
> +                               NULL, &cle);
> +    sum = clCreateBuffer(ctx->ocf.hwctx->context, 0, weight_buf_size,
> +                               NULL, &cle);
> +    cle = clEnqueueFillBuffer(ctx->command_queue, weight, &zero,
> sizeof(float),
> +                              0, weight_buf_size, 0, NULL, NULL);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to fill weight buffer: %d.\n",
> +                     cle);
> +    cle = clEnqueueFillBuffer(ctx->command_queue, sum, &zero,
> sizeof(float),
> +                              0, weight_buf_size, 0, NULL, NULL);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to fill sum buffer: %d.\n",
> +                     cle);
> +
> +    nb_pixel = (2*r+1) *(2*r+1)-1;
> +    dxdy = av_malloc(nb_pixel * 2 * sizeof(int));
> +    tmp = av_malloc(nb_pixel * 2 * sizeof(int));
> +
> +    if (!dxdy || !tmp)
> +        goto fail;
> +
> +    for (dx = -r; dx <= r; dx++) {
> +        for (dy = -r; dy <= r; dy++) {
> +            if (dx || dy) {
> +                tmp[idx++] = dx;
> +                tmp[idx++] = dy;
> +            }
> +        }
> +    }
> +    // repack dx/dy seperately, as we want to do four pairs of dx/dy in a
> batch
> +    for (int i = 0; i < nb_pixel/4;i++) {
> +        dxdy[i * 8] = tmp[i * 8];         // dx0
> +        dxdy[i * 8 + 1] = tmp[i * 8 + 2]; // dx1
> +        dxdy[i * 8 + 2] = tmp[i * 8 + 4]; // dx2
> +        dxdy[i * 8 + 3] = tmp[i * 8 + 6]; // dx3
> +        dxdy[i * 8 + 4] = tmp[i * 8 + 1]; // dy0
> +        dxdy[i * 8 + 5] = tmp[i * 8 + 3]; // dy1
> +        dxdy[i * 8 + 6] = tmp[i * 8 + 5]; // dy2
> +        dxdy[i * 8 + 7] = tmp[i * 8 + 7]; // dy3
> +    }
> +    av_freep(&tmp);
> +
> +    for (int i = 0; i < nb_pixel / 4; i++) {
> +        int *dx_cur = dxdy + 8 * i;
> +        int *dy_cur = dxdy + 8 * i + 4;
> +
> +        // ii(x,y) = sum of [u(i,y) - u(i+dx,y+dy)]^2 for all i < x
> +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 0, cl_mem, &ii);
> +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 1, cl_mem, &src);
> +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 2, cl_int, &w);
> +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 3, cl_int, &h);
> +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 4, cl_int4, dx_cur);
> +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 5, cl_int4, dy_cur);
> +        cle = clEnqueueNDRangeKernel(ctx->command_queue,
> ctx->horiz_kernel, 1,
> +                               NULL, worksize1, NULL, 0, NULL, NULL);
> +        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue horiz_kernel:
> %d.\n",
> +                         cle);
> +
> +        // ii(x,y) = ii(x,0) + ii(x,1) +...+ ii(x,y-1)
> +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 0, cl_mem, &ii);
> +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 1, cl_int, &w);
> +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 2, cl_int, &h);
> +        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->vert_kernel,
> +                                     1, NULL, worksize2, NULL, 0, NULL,
> NULL);
> +        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue vert_kernel:
> %d.\n",
> +                         cle);
> +
> +        // accumlate weights
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 0, cl_mem, &sum);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 1, cl_mem, &weight);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 2, cl_mem, &ii);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 3, cl_mem, &src);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 4, cl_int, &w);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 5, cl_int, &h);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 6, cl_int, &p);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 7, cl_float, &ctx->h);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 8, cl_int4, dx_cur);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 9, cl_int4, dy_cur);
> +        cle = clEnqueueNDRangeKernel(ctx->command_queue,
> ctx->accum_kernel,
> +                                     2, NULL, worksize3, NULL, 0, NULL,
> NULL);
> +        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue kernel: %d.\n",
> cle);
> +    }
> +    av_freep(&dxdy);
> +
> +    // average
> +    CL_SET_KERNEL_ARG(ctx->average_kernel, 0, cl_mem, &dst);
> +    CL_SET_KERNEL_ARG(ctx->average_kernel, 1, cl_mem, &src);
> +    CL_SET_KERNEL_ARG(ctx->average_kernel, 2, cl_mem, &sum);
> +    CL_SET_KERNEL_ARG(ctx->average_kernel, 3, cl_mem, &weight);
> +    cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->average_kernel,
> 2,
> +                                 NULL, worksize3, NULL, 0, NULL, NULL);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue average kernel:
> %d.\n",
> +                     cle);
> +    cle = clFinish(ctx->command_queue);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to finish kernel: %d.\n", cle);
> +fail:
> +    if (tmp)
> +      av_freep(&tmp);
> +    if (dxdy)
> +      av_freep(&dxdy);
> +    clFinish(ctx->command_queue);
> +    clReleaseMemObject(ii);
> +    clReleaseMemObject(weight);
> +    clReleaseMemObject(sum);
> +    return err;
> +}
> +
> +static int nlmeans_opencl_filter_frame(AVFilterLink *inlink, AVFrame
> *input)
> +{
> +    AVFilterContext    *avctx = inlink->dst;
> +    AVFilterLink     *outlink = avctx->outputs[0];
> +    NLMeansOpenCLContext *ctx = avctx->priv;
> +    AVFrame *output = NULL;
> +    AVHWFramesContext *input_frames_ctx;
> +    const AVPixFmtDescriptor *desc;
> +    enum AVPixelFormat in_format;
> +    cl_mem src, dst;
> +    int w, h, err, p, patch, research;
> +
> +    av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",
> +           av_get_pix_fmt_name(input->format),
> +           input->width, input->height, input->pts);
> +
> +    if (!input->hw_frames_ctx)
> +        return AVERROR(EINVAL);
> +    input_frames_ctx = (AVHWFramesContext*)input->hw_frames_ctx->data;
> +    in_format = input_frames_ctx->sw_format;
> +
> +    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
> +    if (!output) {
> +        err = AVERROR(ENOMEM);
> +        goto fail;
> +    }
> +
> +    err = av_frame_copy_props(output, input);
> +    if (err < 0)
> +        goto fail;
> +
> +    if (!ctx->initialised) {
> +        desc = av_pix_fmt_desc_get(in_format);
> +        if (!is_format_supported(in_format)) {
> +            err = AVERROR(EINVAL);
> +            av_log(avctx, AV_LOG_ERROR, "input format %s not supported\n",
> +                   av_get_pix_fmt_name(in_format));
> +            goto fail;
> +        }
> +        ctx->chroma_w = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);
> +        ctx->chroma_h = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
> +
> +        err = nlmeans_opencl_init(avctx);
> +        if (err < 0)
> +            goto fail;
> +    }
> +
> +    for (p = 0; p < FF_ARRAY_ELEMS(output->data); p++) {
> +        src = (cl_mem) input->data[p];
> +        dst = (cl_mem) output->data[p];
> +
> +        if (!dst)
> +            break;
> +        w = p ? ctx->chroma_w : inlink->w;
> +        h = p ? ctx->chroma_h : inlink->h;
> +        patch = (p ? ctx->patch_size_uv : ctx->patch_size) / 2;
> +        research = (p ? ctx->research_size_uv : ctx->research_size) / 2;
> +        err = nlmeans_plane(avctx, dst, src, w, h, patch, research);
> +        if (err < 0)
> +            goto fail;
> +    }
> +
> +    av_frame_free(&input);
> +
> +    av_log(ctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n",
> +           av_get_pix_fmt_name(output->format),
> +           output->width, output->height, output->pts);
> +
> +    return ff_filter_frame(outlink, output);
> +
> +fail:
> +    clFinish(ctx->command_queue);
> +    av_frame_free(&input);
> +    av_frame_free(&output);
> +    return err;
> +}
> +
> +#define RELEASE_KERNEL(k)                                    \
> +do {                                                         \
> +    if (k) {                                                 \
> +        cle = clReleaseKernel(k);                            \
> +        if (cle != CL_SUCCESS)                               \
> +            av_log(avctx, AV_LOG_ERROR, "Failed to release " \
> +                   "kernel: %d.\n", cle);                    \
> +    }                                                        \
> +} while(0)
> +
> +static av_cold void nlmeans_opencl_uninit(AVFilterContext *avctx)
> +{
> +    NLMeansOpenCLContext *ctx = avctx->priv;
> +    cl_int cle;
> +
> +    RELEASE_KERNEL(ctx->vert_kernel);
> +    RELEASE_KERNEL(ctx->horiz_kernel);
> +    RELEASE_KERNEL(ctx->accum_kernel);
> +    RELEASE_KERNEL(ctx->average_kernel);
> +
> +    if (ctx->command_queue) {
> +        cle = clReleaseCommandQueue(ctx->command_queue);
> +        if (cle != CL_SUCCESS)
> +            av_log(avctx, AV_LOG_ERROR, "Failed to release "
> +                   "command queue: %d.\n", cle);
> +    }
> +
> +    ff_opencl_filter_uninit(avctx);
> +}
> +
> +#define OFFSET(x) offsetof(NLMeansOpenCLContext, x)
> +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
> +static const AVOption nlmeans_opencl_options[] = {
> +    { "s",  "denoising strength", OFFSET(sigma), AV_OPT_TYPE_DOUBLE, {
> .dbl = 1.0 }, 1.0, 30.0, FLAGS },
> +    { "p",  "patch size",                   OFFSET(patch_size),
> AV_OPT_TYPE_INT, { .i64 = 2*3+1 }, 0, 99, FLAGS },
> +    { "pc", "patch size for chroma planes", OFFSET(patch_size_uv),
> AV_OPT_TYPE_INT, { .i64 = 0 },     0, 99, FLAGS },
> +    { "r",  "research window",                   OFFSET(research_size),
>   AV_OPT_TYPE_INT, { .i64 = 7*2+1 }, 0, 99, FLAGS },
> +    { "rc", "research window for chroma planes",
> OFFSET(research_size_uv), AV_OPT_TYPE_INT, { .i64 = 0 },     0, 99, FLAGS },
> +    { NULL }
> +};
> +
> +AVFILTER_DEFINE_CLASS(nlmeans_opencl);
> +
> +static const AVFilterPad nlmeans_opencl_inputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +        .filter_frame = &nlmeans_opencl_filter_frame,
> +        .config_props = &ff_opencl_filter_config_input,
> +    },
> +    { NULL }
> +};
> +
> +static const AVFilterPad nlmeans_opencl_outputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +        .config_props = &ff_opencl_filter_config_output,
> +    },
> +    { NULL }
> +};
> +
> +AVFilter ff_vf_nlmeans_opencl = {
> +    .name           = "nlmeans_opencl",
> +    .description    = NULL_IF_CONFIG_SMALL("Non-local means denoiser
> through OpenCL"),
> +    .priv_size      = sizeof(NLMeansOpenCLContext),
> +    .priv_class     = &nlmeans_opencl_class,
> +    .init           = &ff_opencl_filter_init,
> +    .uninit         = &nlmeans_opencl_uninit,
> +    .query_formats  = &ff_opencl_filter_query_formats,
> +    .inputs         = nlmeans_opencl_inputs,
> +    .outputs        = nlmeans_opencl_outputs,
> +    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
> +};
>
> Can you supply some details performance data ?
Ruiling Song April 2, 2019, 2:16 a.m. UTC | #2
> Can you supply some details performance data ? 


On my i7-6770HQ, the nlmeans take 1.2s to process one 1080p frame.
And nlmeans_opencl take 500ms to process one frame.

Ruiling
Ruiling Song April 8, 2019, 1:33 a.m. UTC | #3
> -----Original Message-----
> From: Song, Ruiling
> Sent: Monday, April 1, 2019 3:53 PM
> To: ffmpeg-devel@ffmpeg.org
> Cc: Song, Ruiling <ruiling.song@intel.com>
> Subject: [PATCH] lavfi: add nlmeans_opencl filter
> 
> Signed-off-by: Ruiling Song <ruiling.song@intel.com>
> ---
> This filter runs about 2x faster on integrated GPU than nlmeans on my Skylake
> CPU.
> Anybody like to give some comments?

Ping?
Jun Zhao April 8, 2019, 1:37 a.m. UTC | #4
On Mon, Apr 8, 2019 at 9:33 AM Song, Ruiling <ruiling.song@intel.com> wrote:
>
> > -----Original Message-----
> > From: Song, Ruiling
> > Sent: Monday, April 1, 2019 3:53 PM
> > To: ffmpeg-devel@ffmpeg.org
> > Cc: Song, Ruiling <ruiling.song@intel.com>
> > Subject: [PATCH] lavfi: add nlmeans_opencl filter
> >
> > Signed-off-by: Ruiling Song <ruiling.song@intel.com>
> > ---
> > This filter runs about 2x faster on integrated GPU than nlmeans on my Skylake
> > CPU.
> > Anybody like to give some comments?
>
> Ping?
>
Tested and verified in i5-8265U

OpenCL CPU/pocl 1.2fps with 1080P input
OpenCL GPU/intel NEO 1.2 fps with 1080P input
Ruiling Song April 8, 2019, 2:03 a.m. UTC | #5
> -----Original Message-----

> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of

> mypopy@gmail.com

> Sent: Monday, April 8, 2019 9:37 AM

> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>

> Subject: Re: [FFmpeg-devel] [PATCH] lavfi: add nlmeans_opencl filter

> 

> On Mon, Apr 8, 2019 at 9:33 AM Song, Ruiling <ruiling.song@intel.com> wrote:

> >

> > > -----Original Message-----

> > > From: Song, Ruiling

> > > Sent: Monday, April 1, 2019 3:53 PM

> > > To: ffmpeg-devel@ffmpeg.org

> > > Cc: Song, Ruiling <ruiling.song@intel.com>

> > > Subject: [PATCH] lavfi: add nlmeans_opencl filter

> > >

> > > Signed-off-by: Ruiling Song <ruiling.song@intel.com>

> > > ---

> > > This filter runs about 2x faster on integrated GPU than nlmeans on my

> Skylake

> > > CPU.

> > > Anybody like to give some comments?

> >

> > Ping?

> >

> Tested and verified in i5-8265U


Thanks for the testing. And comments about the code itself are welcome.
The performance data highly depend on the research-window parameters and also the hardware.
I think you may play-with the parameters to make a trade-off between speed and quality.

Thanks!
Ruiling
> 

> OpenCL CPU/pocl 1.2fps with 1080P input

> OpenCL GPU/intel NEO 1.2 fps with 1080P input

> _______________________________________________

> ffmpeg-devel mailing list

> ffmpeg-devel@ffmpeg.org

> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

> 

> To unsubscribe, visit link above, or email

> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Mark Thompson April 8, 2019, 8:25 p.m. UTC | #6
On 01/04/2019 08:52, Ruiling Song wrote:
> Signed-off-by: Ruiling Song <ruiling.song@intel.com>
> ---
> This filter runs about 2x faster on integrated GPU than nlmeans on my Skylake CPU.
> Anybody like to give some comments?

Nice!

>  configure                       |   1 +
>  doc/filters.texi                |   4 +
>  libavfilter/Makefile            |   1 +
>  libavfilter/allfilters.c        |   1 +
>  libavfilter/opencl/nlmeans.cl   | 108 +++++++++
>  libavfilter/opencl_source.h     |   1 +
>  libavfilter/vf_nlmeans_opencl.c | 390 ++++++++++++++++++++++++++++++++
>  7 files changed, 506 insertions(+)
>  create mode 100644 libavfilter/opencl/nlmeans.cl
>  create mode 100644 libavfilter/vf_nlmeans_opencl.c
> 
> diff --git a/configure b/configure
> index f6123f53e5..a233512491 100755
> --- a/configure
> +++ b/configure
> @@ -3460,6 +3460,7 @@ mpdecimate_filter_select="pixelutils"
>  minterpolate_filter_select="scene_sad"
>  mptestsrc_filter_deps="gpl"
>  negate_filter_deps="lut_filter"
> +nlmeans_opencl_filter_deps="opencl"
>  nnedi_filter_deps="gpl"
>  ocr_filter_deps="libtesseract"
>  ocv_filter_deps="libopencv"
> diff --git a/doc/filters.texi b/doc/filters.texi
> index 867607d870..21c2c1a4b5 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -19030,6 +19030,10 @@ Apply erosion filter with threshold0 set to 30, threshold1 set 40, threshold2 se
>  @end example
>  @end itemize
>  
> +@section nlmeans_opencl
> +
> +Non-local Means denoise filter through OpenCL, this filter accepts same options as @ref{nlmeans}.
> +
>  @section overlay_opencl
>  
>  Overlay one video on top of another.
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index fef6ec5c55..92039bfdcf 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -291,6 +291,7 @@ OBJS-$(CONFIG_MIX_FILTER)                    += vf_mix.o
>  OBJS-$(CONFIG_MPDECIMATE_FILTER)             += vf_mpdecimate.o
>  OBJS-$(CONFIG_NEGATE_FILTER)                 += vf_lut.o
>  OBJS-$(CONFIG_NLMEANS_FILTER)                += vf_nlmeans.o
> +OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER)         += vf_nlmeans_opencl.o opencl.o opencl/nlmeans.o
>  OBJS-$(CONFIG_NNEDI_FILTER)                  += vf_nnedi.o
>  OBJS-$(CONFIG_NOFORMAT_FILTER)               += vf_format.o
>  OBJS-$(CONFIG_NOISE_FILTER)                  += vf_noise.o
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index c51ae0f3c7..2a6390c92d 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -277,6 +277,7 @@ extern AVFilter ff_vf_mix;
>  extern AVFilter ff_vf_mpdecimate;
>  extern AVFilter ff_vf_negate;
>  extern AVFilter ff_vf_nlmeans;
> +extern AVFilter ff_vf_nlmeans_opencl;
>  extern AVFilter ff_vf_nnedi;
>  extern AVFilter ff_vf_noformat;
>  extern AVFilter ff_vf_noise;
> diff --git a/libavfilter/opencl/nlmeans.cl b/libavfilter/opencl/nlmeans.cl
> new file mode 100644
> index 0000000000..dcb04834ca
> --- /dev/null
> +++ b/libavfilter/opencl/nlmeans.cl
> @@ -0,0 +1,108 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
> +                           CLK_ADDRESS_CLAMP_TO_EDGE   |
> +                           CLK_FILTER_NEAREST);
> +
> +kernel void horiz_sum(__global uint4 *ii,
> +                      __read_only image2d_t src,
> +                      int width,
> +                      int height,
> +                      int4 dx,
> +                      int4 dy)
> +{
> +
> +    int y = get_global_id(0);
> +    int work_size = get_global_size(0);
> +
> +    uint4 sum = (uint4)(0);
> +    float4 s2;
> +    for (int i = 0; i < width; i++) {
> +        float s1 = read_imagef(src, sampler, (int2)(i, y)).x;
> +        s2.x = read_imagef(src, sampler, (int2)(i+dx.x, y+dy.x)).x;
> +        s2.y = read_imagef(src, sampler, (int2)(i+dx.y, y+dy.y)).x;
> +        s2.z = read_imagef(src, sampler, (int2)(i+dx.z, y+dy.z)).x;
> +        s2.w = read_imagef(src, sampler, (int2)(i+dx.w, y+dy.w)).x;
> +        sum += convert_uint4((s1-s2)*(s1-s2) * 255*255);
> +        ii[y * width + i] = sum;
> +    }
> +}
> +
> +kernel void vert_sum(__global uint4 *ii,
> +                     int width,
> +                     int height)
> +{
> +    int x = get_global_id(0);
> +    uint4 sum = 0;
> +    for (int i = 0; i < height; i++) {
> +        ii[i * width + x] += sum;
> +        sum = ii[i * width + x];

This looks like it might be able to overflow in extreme cases?

3840 * 2160 * (1 - 0)^2 * 255 * 255 = 539,343,360,000 which is a long way out of range for a 32-bit int.  That requires impossible input (all pixels differing by the most extreme value), but something like a chequerboard might be of the same order?

> +    }
> +}
> +
> +kernel void weight_accum(global float *sum, global float *weight,
> +                         global uint4 *ii, __read_only image2d_t src,
> +                         int width, int height, int p, float h,
> +                         int4 dx, int4 dy)
> +{
> +    // w(x) = ii(x-p, y-p) + ii(x+p, y+p) - ii(x+p, y-p) - ii(x-p, y+p)
> +    // total_sum[x] += w(x, y) * src(x+dx, y+dy)
> +    // total_weight += w(x, y)
> +    int x = get_global_id(0);
> +    int y = get_global_id(1);
> +    int4 xoff = x+dx;
> +    int4 yoff = y+dy;
> +    uint4 a = 0, b = 0, c = 0, d = 0;
> +    uint4 src_pix = 0;
> +
> +    // out-of-bounding-box?
> +    int oobb = (x-p) < 0 || (y-p) < 0 || (y+p) >= height || (x+p) >= width;
> +
> +    src_pix.x = (int)(255 * read_imagef(src, sampler, (int2)(xoff.x, yoff.x)).x);
> +    src_pix.y = (int)(255 * read_imagef(src, sampler, (int2)(xoff.y, yoff.y)).x);
> +    src_pix.z = (int)(255 * read_imagef(src, sampler, (int2)(xoff.z, yoff.z)).x);
> +    src_pix.w = (int)(255 * read_imagef(src, sampler, (int2)(xoff.w, yoff.w)).x);
> +    if (!oobb) {
> +        a = ii[(y-p) * width + x - p];
> +        b = ii[(y + p) * width + x - p];
> +        c = ii[(y-p) * width + x + p];
> +        d = ii[(y + p) * width + x + p];
> +    }
> +
> +    float4 patch_diff = convert_float4(d + a - c - b);
> +    float4 w = native_exp(-patch_diff/(h*h));
> +    float w_sum = w.x + w.y + w.z + w.w;
> +    weight[y*width + x] += w_sum;
> +    sum[y*width + x] += dot(w, convert_float4(src_pix));
> +}

I feel like the global ii buffer in all of these functions could be given a clearer name based on what it's doing in that function.

> +
> +kernel void average(__write_only image2d_t dst,
> +                    __read_only image2d_t src,
> +                    global float *sum, global float *weight) {
> +    int x = get_global_id(0);
> +    int y = get_global_id(1);
> +    int2 dim = get_image_dim(dst);
> +
> +    float w = weight[y * dim.x + x];
> +    float s = sum[y*dim.x + x];
> +    float src_pix = read_imagef(src, sampler, (int2)(x, y)).x;
> +    float r = (s + src_pix * 255) / (1.0f + w) / 255.0f;
> +    if (x < dim.x && y < dim.y)
> +        write_imagef(dst, (int2)(x, y), (float4)(r, 0.0f, 0.0f, 1.0f));
> +}
> diff --git a/libavfilter/opencl_source.h b/libavfilter/opencl_source.h
> index 4118138c30..fd40fd7dca 100644
> --- a/libavfilter/opencl_source.h
> +++ b/libavfilter/opencl_source.h
> @@ -23,6 +23,7 @@ extern const char *ff_opencl_source_avgblur;
>  extern const char *ff_opencl_source_colorspace_common;
>  extern const char *ff_opencl_source_convolution;
>  extern const char *ff_opencl_source_neighbor;
> +extern const char *ff_opencl_source_nlmeans;
>  extern const char *ff_opencl_source_overlay;
>  extern const char *ff_opencl_source_tonemap;
>  extern const char *ff_opencl_source_transpose;
> diff --git a/libavfilter/vf_nlmeans_opencl.c b/libavfilter/vf_nlmeans_opencl.c
> new file mode 100644
> index 0000000000..0cc9af10da
> --- /dev/null
> +++ b/libavfilter/vf_nlmeans_opencl.c
> @@ -0,0 +1,390 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +#include <float.h>
> +
> +#include "libavutil/avassert.h"
> +#include "libavutil/common.h"
> +#include "libavutil/imgutils.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/pixdesc.h"
> +
> +#include "avfilter.h"
> +#include "internal.h"
> +#include "opencl.h"
> +#include "opencl_source.h"
> +#include "video.h"
> +
> +static const enum AVPixelFormat supported_formats[] = {
> +    AV_PIX_FMT_YUV420P,
> +    AV_PIX_FMT_YUV444P,
> +    AV_PIX_FMT_GBRP,

YUV420P16, YUV444P16, GBRP16 probably also work with no change?

> +};
> +
> +static int is_format_supported(enum AVPixelFormat fmt)
> +{
> +    int i;
> +
> +    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
> +        if (supported_formats[i] == fmt)
> +            return 1;
> +    return 0;
> +}
> +
> +typedef struct NLMeansOpenCLContext {
> +    OpenCLFilterContext   ocf;
> +    int                   initialised;
> +    cl_kernel             vert_kernel;
> +    cl_kernel             horiz_kernel;
> +    cl_kernel             accum_kernel;
> +    cl_kernel             average_kernel;
> +    double                sigma;
> +    float                 h;
> +    int                   chroma_w;
> +    int                   chroma_h;
> +    int                   patch_size;
> +    int                   patch_size_uv;
> +    int                   research_size;
> +    int                   research_size_uv;
> +    cl_command_queue      command_queue;
> +} NLMeansOpenCLContext;
> +
> +static int nlmeans_opencl_init(AVFilterContext *avctx)
> +{
> +    NLMeansOpenCLContext *ctx = avctx->priv;
> +    cl_int cle;
> +    int err;
> +
> +    ctx->h = ctx->sigma * 10;
> +    if (!ctx->research_size_uv)
> +        ctx->research_size_uv = ctx->research_size;
> +    if (!ctx->patch_size_uv)
> +        ctx->patch_size_uv = ctx->patch_size;
> +
> +    err = ff_opencl_filter_load_program(avctx, &ff_opencl_source_nlmeans, 1);
> +    if (err < 0)
> +        goto fail;
> +
> +    ctx->command_queue = clCreateCommandQueue(ctx->ocf.hwctx->context,
> +                                              ctx->ocf.hwctx->device_id,
> +                                              0, &cle);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create OpenCL "
> +                     "command queue %d.\n", cle);
> +
> +    ctx->vert_kernel = clCreateKernel(ctx->ocf.program, "vert_sum", &cle);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create vert_sum kernel %d.\n", cle);
> +
> +    ctx->horiz_kernel = clCreateKernel(ctx->ocf.program, "horiz_sum", &cle);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create horiz_sum kernel %d.\n", cle);
> +
> +    ctx->accum_kernel = clCreateKernel(ctx->ocf.program, "weight_accum", &cle);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create accum kernel %d.\n", cle);
> +
> +    ctx->average_kernel = clCreateKernel(ctx->ocf.program, "average", &cle);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create average kernel %d.\n", cle);
> +
> +    ctx->initialised = 1;
> +    return 0;
> +
> +fail:
> +    if (ctx->command_queue)
> +        clReleaseCommandQueue(ctx->command_queue);
> +    if (ctx->vert_kernel)
> +        clReleaseKernel(ctx->vert_kernel);
> +    if (ctx->horiz_kernel)
> +        clReleaseKernel(ctx->horiz_kernel);
> +    if (ctx->accum_kernel)
> +        clReleaseKernel(ctx->accum_kernel);
> +    if (ctx->average_kernel)
> +        clReleaseKernel(ctx->average_kernel);
> +    return err;
> +}
> +
> +static int nlmeans_plane(AVFilterContext *avctx, cl_mem dst, cl_mem src,
> +                         int w, int h, int p, int r)
> +{
> +    NLMeansOpenCLContext *ctx = avctx->priv;
> +    const float zero = 0.0f;
> +    const size_t worksize1[] = {h};
> +    const size_t worksize2[] = {w};
> +    const size_t worksize3[2] = {w, h};
> +    int dx, dy, err = 0, weight_buf_size;
> +    cl_mem ii, weight, sum;
> +    cl_int cle;
> +    int nb_pixel, *tmp, *dxdy, idx = 0;

I think some of these should be cl_int since they are going to be used on the device side.

> +
> +    weight_buf_size = w * h * sizeof(int);

sizeof(cl_int)

> +    ii = clCreateBuffer(ctx->ocf.hwctx->context, 0, 4 * weight_buf_size,
> +                               NULL, &cle);
> +    weight = clCreateBuffer(ctx->ocf.hwctx->context, 0, weight_buf_size,
> +                               NULL, &cle);
> +    sum = clCreateBuffer(ctx->ocf.hwctx->context, 0, weight_buf_size,
> +                               NULL, &cle);

These allocations are unchecked.

The sizes shouldn't vary - is there any benefit to allocating the buffers once and them keeping them across iterations?  (Maybe it's swamped by other operations being slow anyway.)

> +    cle = clEnqueueFillBuffer(ctx->command_queue, weight, &zero, sizeof(float),
> +                              0, weight_buf_size, 0, NULL, NULL);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to fill weight buffer: %d.\n",
> +                     cle);
> +    cle = clEnqueueFillBuffer(ctx->command_queue, sum, &zero, sizeof(float),
> +                              0, weight_buf_size, 0, NULL, NULL);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to fill sum buffer: %d.\n",
> +                     cle);
> +
> +    nb_pixel = (2*r+1) *(2*r+1)-1;

Spaces around binary operators, please.  (Also in some places below.)

> +    dxdy = av_malloc(nb_pixel * 2 * sizeof(int));
> +    tmp = av_malloc(nb_pixel * 2 * sizeof(int));
> +
> +    if (!dxdy || !tmp)
> +        goto fail;
> +
> +    for (dx = -r; dx <= r; dx++) {
> +        for (dy = -r; dy <= r; dy++) {
> +            if (dx || dy) {
> +                tmp[idx++] = dx;
> +                tmp[idx++] = dy;
> +            }
> +        }
> +    }
> +    // repack dx/dy seperately, as we want to do four pairs of dx/dy in a batch
> +    for (int i = 0; i < nb_pixel/4;i++) {
> +        dxdy[i * 8] = tmp[i * 8];         // dx0
> +        dxdy[i * 8 + 1] = tmp[i * 8 + 2]; // dx1
> +        dxdy[i * 8 + 2] = tmp[i * 8 + 4]; // dx2
> +        dxdy[i * 8 + 3] = tmp[i * 8 + 6]; // dx3
> +        dxdy[i * 8 + 4] = tmp[i * 8 + 1]; // dy0
> +        dxdy[i * 8 + 5] = tmp[i * 8 + 3]; // dy1
> +        dxdy[i * 8 + 6] = tmp[i * 8 + 5]; // dy2
> +        dxdy[i * 8 + 7] = tmp[i * 8 + 7]; // dy3
> +    }
> +    av_freep(&tmp);
> +
> +    for (int i = 0; i < nb_pixel / 4; i++) {
> +        int *dx_cur = dxdy + 8 * i;
> +        int *dy_cur = dxdy + 8 * i + 4;
> +
> +        // ii(x,y) = sum of [u(i,y) - u(i+dx,y+dy)]^2 for all i < x
> +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 0, cl_mem, &ii);
> +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 1, cl_mem, &src);
> +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 2, cl_int, &w);
> +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 3, cl_int, &h);
> +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 4, cl_int4, dx_cur);
> +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 5, cl_int4, dy_cur);
> +        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->horiz_kernel, 1,
> +                               NULL, worksize1, NULL, 0, NULL, NULL);
> +        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue horiz_kernel: %d.\n",
> +                         cle);
> +
> +        // ii(x,y) = ii(x,0) + ii(x,1) +...+ ii(x,y-1)
> +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 0, cl_mem, &ii);
> +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 1, cl_int, &w);
> +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 2, cl_int, &h);
> +        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->vert_kernel,
> +                                     1, NULL, worksize2, NULL, 0, NULL, NULL);
> +        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue vert_kernel: %d.\n",
> +                         cle);
> +
> +        // accumlate weights
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 0, cl_mem, &sum);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 1, cl_mem, &weight);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 2, cl_mem, &ii);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 3, cl_mem, &src);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 4, cl_int, &w);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 5, cl_int, &h);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 6, cl_int, &p);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 7, cl_float, &ctx->h);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 8, cl_int4, dx_cur);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 9, cl_int4, dy_cur);
> +        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->accum_kernel,
> +                                     2, NULL, worksize3, NULL, 0, NULL, NULL);
> +        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue kernel: %d.\n", cle);
> +    }
> +    av_freep(&dxdy);
> +
> +    // average
> +    CL_SET_KERNEL_ARG(ctx->average_kernel, 0, cl_mem, &dst);
> +    CL_SET_KERNEL_ARG(ctx->average_kernel, 1, cl_mem, &src);
> +    CL_SET_KERNEL_ARG(ctx->average_kernel, 2, cl_mem, &sum);
> +    CL_SET_KERNEL_ARG(ctx->average_kernel, 3, cl_mem, &weight);
> +    cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->average_kernel, 2,
> +                                 NULL, worksize3, NULL, 0, NULL, NULL);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue average kernel: %d.\n",
> +                     cle);
> +    cle = clFinish(ctx->command_queue);

Excepting the destruction of the memory objects, is there any reason it needs the clFinish() here rather than once all planes have been processed?

(More generally, it feels like there should be some more parallelism extractable here.)

> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to finish kernel: %d.\n", cle);
> +fail:
> +    if (tmp)
> +      av_freep(&tmp);
> +    if (dxdy)
> +      av_freep(&dxdy);
> +    clFinish(ctx->command_queue);
> +    clReleaseMemObject(ii);
> +    clReleaseMemObject(weight);
> +    clReleaseMemObject(sum);
> +    return err;
> +}
> +
> +static int nlmeans_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
> +{
> +    AVFilterContext    *avctx = inlink->dst;
> +    AVFilterLink     *outlink = avctx->outputs[0];
> +    NLMeansOpenCLContext *ctx = avctx->priv;
> +    AVFrame *output = NULL;
> +    AVHWFramesContext *input_frames_ctx;
> +    const AVPixFmtDescriptor *desc;
> +    enum AVPixelFormat in_format;
> +    cl_mem src, dst;
> +    int w, h, err, p, patch, research;
> +
> +    av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",
> +           av_get_pix_fmt_name(input->format),
> +           input->width, input->height, input->pts);
> +
> +    if (!input->hw_frames_ctx)
> +        return AVERROR(EINVAL);
> +    input_frames_ctx = (AVHWFramesContext*)input->hw_frames_ctx->data;
> +    in_format = input_frames_ctx->sw_format;
> +
> +    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
> +    if (!output) {
> +        err = AVERROR(ENOMEM);
> +        goto fail;
> +    }
> +
> +    err = av_frame_copy_props(output, input);
> +    if (err < 0)
> +        goto fail;
> +
> +    if (!ctx->initialised) {
> +        desc = av_pix_fmt_desc_get(in_format);
> +        if (!is_format_supported(in_format)) {
> +            err = AVERROR(EINVAL);
> +            av_log(avctx, AV_LOG_ERROR, "input format %s not supported\n",
> +                   av_get_pix_fmt_name(in_format));
> +            goto fail;
> +        }
> +        ctx->chroma_w = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);
> +        ctx->chroma_h = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
> +
> +        err = nlmeans_opencl_init(avctx);
> +        if (err < 0)
> +            goto fail;
> +    }
> +
> +    for (p = 0; p < FF_ARRAY_ELEMS(output->data); p++) {
> +        src = (cl_mem) input->data[p];
> +        dst = (cl_mem) output->data[p];
> +
> +        if (!dst)
> +            break;
> +        w = p ? ctx->chroma_w : inlink->w;
> +        h = p ? ctx->chroma_h : inlink->h;
> +        patch = (p ? ctx->patch_size_uv : ctx->patch_size) / 2;
> +        research = (p ? ctx->research_size_uv : ctx->research_size) / 2;
> +        err = nlmeans_plane(avctx, dst, src, w, h, patch, research);
> +        if (err < 0)
> +            goto fail;
> +    }
> +
> +    av_frame_free(&input);
> +
> +    av_log(ctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n",
> +           av_get_pix_fmt_name(output->format),
> +           output->width, output->height, output->pts);
> +
> +    return ff_filter_frame(outlink, output);
> +
> +fail:
> +    clFinish(ctx->command_queue);
> +    av_frame_free(&input);
> +    av_frame_free(&output);
> +    return err;
> +}
> +
> +#define RELEASE_KERNEL(k)                                    \
> +do {                                                         \
> +    if (k) {                                                 \
> +        cle = clReleaseKernel(k);                            \
> +        if (cle != CL_SUCCESS)                               \
> +            av_log(avctx, AV_LOG_ERROR, "Failed to release " \
> +                   "kernel: %d.\n", cle);                    \
> +    }                                                        \
> +} while(0)

This appears multiple times here and also in other filters.  Maybe it should be a macro in opencl.h like CL_SET_KERNEL_ARG?

> +
> +static av_cold void nlmeans_opencl_uninit(AVFilterContext *avctx)
> +{
> +    NLMeansOpenCLContext *ctx = avctx->priv;
> +    cl_int cle;
> +
> +    RELEASE_KERNEL(ctx->vert_kernel);
> +    RELEASE_KERNEL(ctx->horiz_kernel);
> +    RELEASE_KERNEL(ctx->accum_kernel);
> +    RELEASE_KERNEL(ctx->average_kernel);
> +
> +    if (ctx->command_queue) {
> +        cle = clReleaseCommandQueue(ctx->command_queue);
> +        if (cle != CL_SUCCESS)
> +            av_log(avctx, AV_LOG_ERROR, "Failed to release "
> +                   "command queue: %d.\n", cle);
> +    }
> +
> +    ff_opencl_filter_uninit(avctx);
> +}
> +
> +#define OFFSET(x) offsetof(NLMeansOpenCLContext, x)
> +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
> +static const AVOption nlmeans_opencl_options[] = {
> +    { "s",  "denoising strength", OFFSET(sigma), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 30.0, FLAGS },
> +    { "p",  "patch size",                   OFFSET(patch_size),    AV_OPT_TYPE_INT, { .i64 = 2*3+1 }, 0, 99, FLAGS },
> +    { "pc", "patch size for chroma planes", OFFSET(patch_size_uv), AV_OPT_TYPE_INT, { .i64 = 0 },     0, 99, FLAGS },
> +    { "r",  "research window",                   OFFSET(research_size),    AV_OPT_TYPE_INT, { .i64 = 7*2+1 }, 0, 99, FLAGS },
> +    { "rc", "research window for chroma planes", OFFSET(research_size_uv), AV_OPT_TYPE_INT, { .i64 = 0 },     0, 99, FLAGS },

There is an assumption that the size arguments are odd integers, but it isn't checked anywhere.

> +    { NULL }
> +};
> +
> +AVFILTER_DEFINE_CLASS(nlmeans_opencl);
> +
> +static const AVFilterPad nlmeans_opencl_inputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +        .filter_frame = &nlmeans_opencl_filter_frame,
> +        .config_props = &ff_opencl_filter_config_input,
> +    },
> +    { NULL }
> +};
> +
> +static const AVFilterPad nlmeans_opencl_outputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +        .config_props = &ff_opencl_filter_config_output,
> +    },
> +    { NULL }
> +};
> +
> +AVFilter ff_vf_nlmeans_opencl = {
> +    .name           = "nlmeans_opencl",
> +    .description    = NULL_IF_CONFIG_SMALL("Non-local means denoiser through OpenCL"),
> +    .priv_size      = sizeof(NLMeansOpenCLContext),
> +    .priv_class     = &nlmeans_opencl_class,
> +    .init           = &ff_opencl_filter_init,
> +    .uninit         = &nlmeans_opencl_uninit,
> +    .query_formats  = &ff_opencl_filter_query_formats,
> +    .inputs         = nlmeans_opencl_inputs,
> +    .outputs        = nlmeans_opencl_outputs,
> +    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
> +};
> 

I got good results compared to CPU nlmeans (1.4x speedup) on the GPU vs. CPU of a CFL 8700 with Beignet, even including the upload/download overhead.  Have you tried it on any larger GPU?  Can it get much faster with more processing power, or does something else limit it?

Thanks,

- Mark
Ruiling Song April 9, 2019, 2:54 a.m. UTC | #7
Thanks for the valuable comments!

> -----Original Message-----

> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of

> Mark Thompson

> Sent: Tuesday, April 9, 2019 4:26 AM

> To: ffmpeg-devel@ffmpeg.org

> Subject: Re: [FFmpeg-devel] [PATCH] lavfi: add nlmeans_opencl filter

> 

> On 01/04/2019 08:52, Ruiling Song wrote:

> > Signed-off-by: Ruiling Song <ruiling.song@intel.com>

> > ---

> > This filter runs about 2x faster on integrated GPU than nlmeans on my Skylake

> CPU.

> > Anybody like to give some comments?

> 

> Nice!

> 

> >  configure                       |   1 +

> >  doc/filters.texi                |   4 +

> >  libavfilter/Makefile            |   1 +

> >  libavfilter/allfilters.c        |   1 +

> >  libavfilter/opencl/nlmeans.cl   | 108 +++++++++

> >  libavfilter/opencl_source.h     |   1 +

> >  libavfilter/vf_nlmeans_opencl.c | 390 ++++++++++++++++++++++++++++++++

> >  7 files changed, 506 insertions(+)

> >  create mode 100644 libavfilter/opencl/nlmeans.cl

> >  create mode 100644 libavfilter/vf_nlmeans_opencl.c

> >

> > diff --git a/configure b/configure

> > index f6123f53e5..a233512491 100755

> > --- a/configure

> > +++ b/configure

> > @@ -3460,6 +3460,7 @@ mpdecimate_filter_select="pixelutils"

> >  minterpolate_filter_select="scene_sad"

> >  mptestsrc_filter_deps="gpl"

> >  negate_filter_deps="lut_filter"

> > +nlmeans_opencl_filter_deps="opencl"

> >  nnedi_filter_deps="gpl"

> >  ocr_filter_deps="libtesseract"

> >  ocv_filter_deps="libopencv"

> > diff --git a/doc/filters.texi b/doc/filters.texi

> > index 867607d870..21c2c1a4b5 100644

> > --- a/doc/filters.texi

> > +++ b/doc/filters.texi

> > @@ -19030,6 +19030,10 @@ Apply erosion filter with threshold0 set to 30,

> threshold1 set 40, threshold2 se

> >  @end example

> >  @end itemize

> >

> > +@section nlmeans_opencl

> > +

> > +Non-local Means denoise filter through OpenCL, this filter accepts same

> options as @ref{nlmeans}.

> > +

> >  @section overlay_opencl

> >

> >  Overlay one video on top of another.

> > diff --git a/libavfilter/Makefile b/libavfilter/Makefile

> > index fef6ec5c55..92039bfdcf 100644

> > --- a/libavfilter/Makefile

> > +++ b/libavfilter/Makefile

> > @@ -291,6 +291,7 @@ OBJS-$(CONFIG_MIX_FILTER)                    += vf_mix.o

> >  OBJS-$(CONFIG_MPDECIMATE_FILTER)             += vf_mpdecimate.o

> >  OBJS-$(CONFIG_NEGATE_FILTER)                 += vf_lut.o

> >  OBJS-$(CONFIG_NLMEANS_FILTER)                += vf_nlmeans.o

> > +OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER)         += vf_nlmeans_opencl.o

> opencl.o opencl/nlmeans.o

> >  OBJS-$(CONFIG_NNEDI_FILTER)                  += vf_nnedi.o

> >  OBJS-$(CONFIG_NOFORMAT_FILTER)               += vf_format.o

> >  OBJS-$(CONFIG_NOISE_FILTER)                  += vf_noise.o

> > diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c

> > index c51ae0f3c7..2a6390c92d 100644

> > --- a/libavfilter/allfilters.c

> > +++ b/libavfilter/allfilters.c

> > @@ -277,6 +277,7 @@ extern AVFilter ff_vf_mix;

> >  extern AVFilter ff_vf_mpdecimate;

> >  extern AVFilter ff_vf_negate;

> >  extern AVFilter ff_vf_nlmeans;

> > +extern AVFilter ff_vf_nlmeans_opencl;

> >  extern AVFilter ff_vf_nnedi;

> >  extern AVFilter ff_vf_noformat;

> >  extern AVFilter ff_vf_noise;

> > diff --git a/libavfilter/opencl/nlmeans.cl b/libavfilter/opencl/nlmeans.cl

> > new file mode 100644

> > index 0000000000..dcb04834ca

> > --- /dev/null

> > +++ b/libavfilter/opencl/nlmeans.cl

> > @@ -0,0 +1,108 @@

> > +/*

> > + * This file is part of FFmpeg.

> > + *

> > + * FFmpeg is free software; you can redistribute it and/or

> > + * modify it under the terms of the GNU Lesser General Public

> > + * License as published by the Free Software Foundation; either

> > + * version 2.1 of the License, or (at your option) any later version.

> > + *

> > + * FFmpeg is distributed in the hope that it will be useful,

> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of

> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU

> > + * Lesser General Public License for more details.

> > + *

> > + * You should have received a copy of the GNU Lesser General Public

> > + * License along with FFmpeg; if not, write to the Free Software

> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301

> USA

> > + */

> > +

> > +const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |

> > +                           CLK_ADDRESS_CLAMP_TO_EDGE   |

> > +                           CLK_FILTER_NEAREST);

> > +

> > +kernel void horiz_sum(__global uint4 *ii,

> > +                      __read_only image2d_t src,

> > +                      int width,

> > +                      int height,

> > +                      int4 dx,

> > +                      int4 dy)

> > +{

> > +

> > +    int y = get_global_id(0);

> > +    int work_size = get_global_size(0);

> > +

> > +    uint4 sum = (uint4)(0);

> > +    float4 s2;

> > +    for (int i = 0; i < width; i++) {

> > +        float s1 = read_imagef(src, sampler, (int2)(i, y)).x;

> > +        s2.x = read_imagef(src, sampler, (int2)(i+dx.x, y+dy.x)).x;

> > +        s2.y = read_imagef(src, sampler, (int2)(i+dx.y, y+dy.y)).x;

> > +        s2.z = read_imagef(src, sampler, (int2)(i+dx.z, y+dy.z)).x;

> > +        s2.w = read_imagef(src, sampler, (int2)(i+dx.w, y+dy.w)).x;

> > +        sum += convert_uint4((s1-s2)*(s1-s2) * 255*255);

> > +        ii[y * width + i] = sum;

> > +    }

> > +}

> > +

> > +kernel void vert_sum(__global uint4 *ii,

> > +                     int width,

> > +                     int height)

> > +{

> > +    int x = get_global_id(0);

> > +    uint4 sum = 0;

> > +    for (int i = 0; i < height; i++) {

> > +        ii[i * width + x] += sum;

> > +        sum = ii[i * width + x];

> 

> This looks like it might be able to overflow in extreme cases?

> 

> 3840 * 2160 * (1 - 0)^2 * 255 * 255 = 539,343,360,000 which is a long way out of

> range for a 32-bit int.  That requires impossible input (all pixels differing by the

> most extreme value), but something like a chequerboard might be of the same

> order?

Yes this is a dilemma for me. Generally the filter is with high computation cost.
To fix the overflow, we have to use 64bit integer for the integral image.
Most GPUs are not good at 64bit integer calculation I think. May be we can try later.
So I would prefer to stay with 32bit integer for a while.
I just test against some 1080p source, and does not observe the overflow. Yes theoretically, it would possible.
Do you mean checkerboard may be more possible to overflow? I can have a test.
 
> 

> > +    }

> > +}

> > +

> > +kernel void weight_accum(global float *sum, global float *weight,

> > +                         global uint4 *ii, __read_only image2d_t src,

> > +                         int width, int height, int p, float h,

> > +                         int4 dx, int4 dy)

> > +{

> > +    // w(x) = ii(x-p, y-p) + ii(x+p, y+p) - ii(x+p, y-p) - ii(x-p, y+p)

> > +    // total_sum[x] += w(x, y) * src(x+dx, y+dy)

> > +    // total_weight += w(x, y)

> > +    int x = get_global_id(0);

> > +    int y = get_global_id(1);

> > +    int4 xoff = x+dx;

> > +    int4 yoff = y+dy;

> > +    uint4 a = 0, b = 0, c = 0, d = 0;

> > +    uint4 src_pix = 0;

> > +

> > +    // out-of-bounding-box?

> > +    int oobb = (x-p) < 0 || (y-p) < 0 || (y+p) >= height || (x+p) >= width;

> > +

> > +    src_pix.x = (int)(255 * read_imagef(src, sampler, (int2)(xoff.x, yoff.x)).x);

> > +    src_pix.y = (int)(255 * read_imagef(src, sampler, (int2)(xoff.y, yoff.y)).x);

> > +    src_pix.z = (int)(255 * read_imagef(src, sampler, (int2)(xoff.z, yoff.z)).x);

> > +    src_pix.w = (int)(255 * read_imagef(src, sampler, (int2)(xoff.w, yoff.w)).x);

> > +    if (!oobb) {

> > +        a = ii[(y-p) * width + x - p];

> > +        b = ii[(y + p) * width + x - p];

> > +        c = ii[(y-p) * width + x + p];

> > +        d = ii[(y + p) * width + x + p];

> > +    }

> > +

> > +    float4 patch_diff = convert_float4(d + a - c - b);

> > +    float4 w = native_exp(-patch_diff/(h*h));

> > +    float w_sum = w.x + w.y + w.z + w.w;

> > +    weight[y*width + x] += w_sum;

> > +    sum[y*width + x] += dot(w, convert_float4(src_pix));

> > +}

> 

> I feel like the global ii buffer in all of these functions could be given a clearer

> name based on what it's doing in that function.

I think I can change it to "integral" or "integral_img" which is a bit clearer.

> 

> > +

> > +kernel void average(__write_only image2d_t dst,

> > +                    __read_only image2d_t src,

> > +                    global float *sum, global float *weight) {

> > +    int x = get_global_id(0);

> > +    int y = get_global_id(1);

> > +    int2 dim = get_image_dim(dst);

> > +

> > +    float w = weight[y * dim.x + x];

> > +    float s = sum[y*dim.x + x];

> > +    float src_pix = read_imagef(src, sampler, (int2)(x, y)).x;

> > +    float r = (s + src_pix * 255) / (1.0f + w) / 255.0f;

> > +    if (x < dim.x && y < dim.y)

> > +        write_imagef(dst, (int2)(x, y), (float4)(r, 0.0f, 0.0f, 1.0f));

> > +}

> > diff --git a/libavfilter/opencl_source.h b/libavfilter/opencl_source.h

> > index 4118138c30..fd40fd7dca 100644

> > --- a/libavfilter/opencl_source.h

> > +++ b/libavfilter/opencl_source.h

> > @@ -23,6 +23,7 @@ extern const char *ff_opencl_source_avgblur;

> >  extern const char *ff_opencl_source_colorspace_common;

> >  extern const char *ff_opencl_source_convolution;

> >  extern const char *ff_opencl_source_neighbor;

> > +extern const char *ff_opencl_source_nlmeans;

> >  extern const char *ff_opencl_source_overlay;

> >  extern const char *ff_opencl_source_tonemap;

> >  extern const char *ff_opencl_source_transpose;

> > diff --git a/libavfilter/vf_nlmeans_opencl.c b/libavfilter/vf_nlmeans_opencl.c

> > new file mode 100644

> > index 0000000000..0cc9af10da

> > --- /dev/null

> > +++ b/libavfilter/vf_nlmeans_opencl.c

> > @@ -0,0 +1,390 @@

> > +/*

> > + * This file is part of FFmpeg.

> > + *

> > + * FFmpeg is free software; you can redistribute it and/or

> > + * modify it under the terms of the GNU Lesser General Public

> > + * License as published by the Free Software Foundation; either

> > + * version 2.1 of the License, or (at your option) any later version.

> > + *

> > + * FFmpeg is distributed in the hope that it will be useful,

> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of

> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU

> > + * Lesser General Public License for more details.

> > + *

> > + * You should have received a copy of the GNU Lesser General Public

> > + * License along with FFmpeg; if not, write to the Free Software

> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301

> USA

> > + */

> > +#include <float.h>

> > +

> > +#include "libavutil/avassert.h"

> > +#include "libavutil/common.h"

> > +#include "libavutil/imgutils.h"

> > +#include "libavutil/mem.h"

> > +#include "libavutil/opt.h"

> > +#include "libavutil/pixdesc.h"

> > +

> > +#include "avfilter.h"

> > +#include "internal.h"

> > +#include "opencl.h"

> > +#include "opencl_source.h"

> > +#include "video.h"

> > +

> > +static const enum AVPixelFormat supported_formats[] = {

> > +    AV_PIX_FMT_YUV420P,

> > +    AV_PIX_FMT_YUV444P,

> > +    AV_PIX_FMT_GBRP,

> 

> YUV420P16, YUV444P16, GBRP16 probably also work with no change?

I am accumulating against the integer value of the pixels, it need to update the float to integer multiplier used in OpenCL kernel.

> 

> > +};

> > +

> > +static int is_format_supported(enum AVPixelFormat fmt)

> > +{

> > +    int i;

> > +

> > +    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)

> > +        if (supported_formats[i] == fmt)

> > +            return 1;

> > +    return 0;

> > +}

> > +

> > +typedef struct NLMeansOpenCLContext {

> > +    OpenCLFilterContext   ocf;

> > +    int                   initialised;

> > +    cl_kernel             vert_kernel;

> > +    cl_kernel             horiz_kernel;

> > +    cl_kernel             accum_kernel;

> > +    cl_kernel             average_kernel;

> > +    double                sigma;

> > +    float                 h;

> > +    int                   chroma_w;

> > +    int                   chroma_h;

> > +    int                   patch_size;

> > +    int                   patch_size_uv;

> > +    int                   research_size;

> > +    int                   research_size_uv;

> > +    cl_command_queue      command_queue;

> > +} NLMeansOpenCLContext;

> > +

> > +static int nlmeans_opencl_init(AVFilterContext *avctx)

> > +{

> > +    NLMeansOpenCLContext *ctx = avctx->priv;

> > +    cl_int cle;

> > +    int err;

> > +

> > +    ctx->h = ctx->sigma * 10;

> > +    if (!ctx->research_size_uv)

> > +        ctx->research_size_uv = ctx->research_size;

> > +    if (!ctx->patch_size_uv)

> > +        ctx->patch_size_uv = ctx->patch_size;

> > +

> > +    err = ff_opencl_filter_load_program(avctx, &ff_opencl_source_nlmeans,

> 1);

> > +    if (err < 0)

> > +        goto fail;

> > +

> > +    ctx->command_queue = clCreateCommandQueue(ctx->ocf.hwctx-

> >context,

> > +                                              ctx->ocf.hwctx->device_id,

> > +                                              0, &cle);

> > +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create OpenCL "

> > +                     "command queue %d.\n", cle);

> > +

> > +    ctx->vert_kernel = clCreateKernel(ctx->ocf.program, "vert_sum", &cle);

> > +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create vert_sum

> kernel %d.\n", cle);

> > +

> > +    ctx->horiz_kernel = clCreateKernel(ctx->ocf.program, "horiz_sum", &cle);

> > +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create horiz_sum

> kernel %d.\n", cle);

> > +

> > +    ctx->accum_kernel = clCreateKernel(ctx->ocf.program, "weight_accum",

> &cle);

> > +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create accum kernel %d.\n",

> cle);

> > +

> > +    ctx->average_kernel = clCreateKernel(ctx->ocf.program, "average", &cle);

> > +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create average

> kernel %d.\n", cle);

> > +

> > +    ctx->initialised = 1;

> > +    return 0;

> > +

> > +fail:

> > +    if (ctx->command_queue)

> > +        clReleaseCommandQueue(ctx->command_queue);

> > +    if (ctx->vert_kernel)

> > +        clReleaseKernel(ctx->vert_kernel);

> > +    if (ctx->horiz_kernel)

> > +        clReleaseKernel(ctx->horiz_kernel);

> > +    if (ctx->accum_kernel)

> > +        clReleaseKernel(ctx->accum_kernel);

> > +    if (ctx->average_kernel)

> > +        clReleaseKernel(ctx->average_kernel);

> > +    return err;

> > +}

> > +

> > +static int nlmeans_plane(AVFilterContext *avctx, cl_mem dst, cl_mem src,

> > +                         int w, int h, int p, int r)

> > +{

> > +    NLMeansOpenCLContext *ctx = avctx->priv;

> > +    const float zero = 0.0f;

> > +    const size_t worksize1[] = {h};

> > +    const size_t worksize2[] = {w};

> > +    const size_t worksize3[2] = {w, h};

> > +    int dx, dy, err = 0, weight_buf_size;

> > +    cl_mem ii, weight, sum;

> > +    cl_int cle;

> > +    int nb_pixel, *tmp, *dxdy, idx = 0;

> 

> I think some of these should be cl_int since they are going to be used on the

> device side.

Will fix it.
> 

> > +

> > +    weight_buf_size = w * h * sizeof(int);

> 

> sizeof(cl_int)

yes
> 

> > +    ii = clCreateBuffer(ctx->ocf.hwctx->context, 0, 4 * weight_buf_size,

> > +                               NULL, &cle);

> > +    weight = clCreateBuffer(ctx->ocf.hwctx->context, 0, weight_buf_size,

> > +                               NULL, &cle);

> > +    sum = clCreateBuffer(ctx->ocf.hwctx->context, 0, weight_buf_size,

> > +                               NULL, &cle);

> 

> These allocations are unchecked.

Will fix it
> 

> The sizes shouldn't vary - is there any benefit to allocating the buffers once and

> them keeping them across iterations?  (Maybe it's swamped by other operations

> being slow anyway.)

Sounds good idea, will make the buffer globally shared.
> 

> > +    cle = clEnqueueFillBuffer(ctx->command_queue, weight, &zero,

> sizeof(float),

> > +                              0, weight_buf_size, 0, NULL, NULL);

> > +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to fill weight buffer: %d.\n",

> > +                     cle);

> > +    cle = clEnqueueFillBuffer(ctx->command_queue, sum, &zero, sizeof(float),

> > +                              0, weight_buf_size, 0, NULL, NULL);

> > +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to fill sum buffer: %d.\n",

> > +                     cle);

> > +

> > +    nb_pixel = (2*r+1) *(2*r+1)-1;

> 

> Spaces around binary operators, please.  (Also in some places below.)

Will fix it.
> 

> > +    dxdy = av_malloc(nb_pixel * 2 * sizeof(int));

> > +    tmp = av_malloc(nb_pixel * 2 * sizeof(int));

> > +

> > +    if (!dxdy || !tmp)

> > +        goto fail;

> > +

> > +    for (dx = -r; dx <= r; dx++) {

> > +        for (dy = -r; dy <= r; dy++) {

> > +            if (dx || dy) {

> > +                tmp[idx++] = dx;

> > +                tmp[idx++] = dy;

> > +            }

> > +        }

> > +    }

> > +    // repack dx/dy seperately, as we want to do four pairs of dx/dy in a batch

> > +    for (int i = 0; i < nb_pixel/4;i++) {

> > +        dxdy[i * 8] = tmp[i * 8];         // dx0

> > +        dxdy[i * 8 + 1] = tmp[i * 8 + 2]; // dx1

> > +        dxdy[i * 8 + 2] = tmp[i * 8 + 4]; // dx2

> > +        dxdy[i * 8 + 3] = tmp[i * 8 + 6]; // dx3

> > +        dxdy[i * 8 + 4] = tmp[i * 8 + 1]; // dy0

> > +        dxdy[i * 8 + 5] = tmp[i * 8 + 3]; // dy1

> > +        dxdy[i * 8 + 6] = tmp[i * 8 + 5]; // dy2

> > +        dxdy[i * 8 + 7] = tmp[i * 8 + 7]; // dy3

> > +    }

> > +    av_freep(&tmp);

> > +

> > +    for (int i = 0; i < nb_pixel / 4; i++) {

> > +        int *dx_cur = dxdy + 8 * i;

> > +        int *dy_cur = dxdy + 8 * i + 4;

> > +

> > +        // ii(x,y) = sum of [u(i,y) - u(i+dx,y+dy)]^2 for all i < x

> > +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 0, cl_mem, &ii);

> > +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 1, cl_mem, &src);

> > +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 2, cl_int, &w);

> > +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 3, cl_int, &h);

> > +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 4, cl_int4, dx_cur);

> > +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 5, cl_int4, dy_cur);

> > +        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->horiz_kernel,

> 1,

> > +                               NULL, worksize1, NULL, 0, NULL, NULL);

> > +        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue

> horiz_kernel: %d.\n",

> > +                         cle);

> > +

> > +        // ii(x,y) = ii(x,0) + ii(x,1) +...+ ii(x,y-1)

> > +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 0, cl_mem, &ii);

> > +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 1, cl_int, &w);

> > +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 2, cl_int, &h);

> > +        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->vert_kernel,

> > +                                     1, NULL, worksize2, NULL, 0, NULL, NULL);

> > +        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue

> vert_kernel: %d.\n",

> > +                         cle);

> > +

> > +        // accumlate weights

> > +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 0, cl_mem, &sum);

> > +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 1, cl_mem, &weight);

> > +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 2, cl_mem, &ii);

> > +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 3, cl_mem, &src);

> > +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 4, cl_int, &w);

> > +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 5, cl_int, &h);

> > +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 6, cl_int, &p);

> > +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 7, cl_float, &ctx->h);

> > +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 8, cl_int4, dx_cur);

> > +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 9, cl_int4, dy_cur);

> > +        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx-

> >accum_kernel,

> > +                                     2, NULL, worksize3, NULL, 0, NULL, NULL);

> > +        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue kernel: %d.\n",

> cle);

> > +    }

> > +    av_freep(&dxdy);

> > +

> > +    // average

> > +    CL_SET_KERNEL_ARG(ctx->average_kernel, 0, cl_mem, &dst);

> > +    CL_SET_KERNEL_ARG(ctx->average_kernel, 1, cl_mem, &src);

> > +    CL_SET_KERNEL_ARG(ctx->average_kernel, 2, cl_mem, &sum);

> > +    CL_SET_KERNEL_ARG(ctx->average_kernel, 3, cl_mem, &weight);

> > +    cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx-

> >average_kernel, 2,

> > +                                 NULL, worksize3, NULL, 0, NULL, NULL);

> > +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue average

> kernel: %d.\n",

> > +                     cle);

> > +    cle = clFinish(ctx->command_queue);

> 

> Excepting the destruction of the memory objects, is there any reason it needs

> the clFinish() here rather than once all planes have been processed?

No specific reason. Will remove it.
> 

> (More generally, it feels like there should be some more parallelism extractable

> here.)

I am not sure whether you have good idea?
I agree the filter can be further optimized. But it needs more experiments and work.
 
> 

> > +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to finish kernel: %d.\n", cle);

> > +fail:

> > +    if (tmp)

> > +      av_freep(&tmp);

> > +    if (dxdy)

> > +      av_freep(&dxdy);

> > +    clFinish(ctx->command_queue);

> > +    clReleaseMemObject(ii);

> > +    clReleaseMemObject(weight);

> > +    clReleaseMemObject(sum);

> > +    return err;

> > +}

> > +

> > +static int nlmeans_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)

> > +{

> > +    AVFilterContext    *avctx = inlink->dst;

> > +    AVFilterLink     *outlink = avctx->outputs[0];

> > +    NLMeansOpenCLContext *ctx = avctx->priv;

> > +    AVFrame *output = NULL;

> > +    AVHWFramesContext *input_frames_ctx;

> > +    const AVPixFmtDescriptor *desc;

> > +    enum AVPixelFormat in_format;

> > +    cl_mem src, dst;

> > +    int w, h, err, p, patch, research;

> > +

> > +    av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",

> > +           av_get_pix_fmt_name(input->format),

> > +           input->width, input->height, input->pts);

> > +

> > +    if (!input->hw_frames_ctx)

> > +        return AVERROR(EINVAL);

> > +    input_frames_ctx = (AVHWFramesContext*)input->hw_frames_ctx->data;

> > +    in_format = input_frames_ctx->sw_format;

> > +

> > +    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);

> > +    if (!output) {

> > +        err = AVERROR(ENOMEM);

> > +        goto fail;

> > +    }

> > +

> > +    err = av_frame_copy_props(output, input);

> > +    if (err < 0)

> > +        goto fail;

> > +

> > +    if (!ctx->initialised) {

> > +        desc = av_pix_fmt_desc_get(in_format);

> > +        if (!is_format_supported(in_format)) {

> > +            err = AVERROR(EINVAL);

> > +            av_log(avctx, AV_LOG_ERROR, "input format %s not supported\n",

> > +                   av_get_pix_fmt_name(in_format));

> > +            goto fail;

> > +        }

> > +        ctx->chroma_w = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);

> > +        ctx->chroma_h = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);

> > +

> > +        err = nlmeans_opencl_init(avctx);

> > +        if (err < 0)

> > +            goto fail;

> > +    }

> > +

> > +    for (p = 0; p < FF_ARRAY_ELEMS(output->data); p++) {

> > +        src = (cl_mem) input->data[p];

> > +        dst = (cl_mem) output->data[p];

> > +

> > +        if (!dst)

> > +            break;

> > +        w = p ? ctx->chroma_w : inlink->w;

> > +        h = p ? ctx->chroma_h : inlink->h;

> > +        patch = (p ? ctx->patch_size_uv : ctx->patch_size) / 2;

> > +        research = (p ? ctx->research_size_uv : ctx->research_size) / 2;

> > +        err = nlmeans_plane(avctx, dst, src, w, h, patch, research);

> > +        if (err < 0)

> > +            goto fail;

> > +    }

> > +

> > +    av_frame_free(&input);

> > +

> > +    av_log(ctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n",

> > +           av_get_pix_fmt_name(output->format),

> > +           output->width, output->height, output->pts);

> > +

> > +    return ff_filter_frame(outlink, output);

> > +

> > +fail:

> > +    clFinish(ctx->command_queue);

> > +    av_frame_free(&input);

> > +    av_frame_free(&output);

> > +    return err;

> > +}

> > +

> > +#define RELEASE_KERNEL(k)                                    \

> > +do {                                                         \

> > +    if (k) {                                                 \

> > +        cle = clReleaseKernel(k);                            \

> > +        if (cle != CL_SUCCESS)                               \

> > +            av_log(avctx, AV_LOG_ERROR, "Failed to release " \

> > +                   "kernel: %d.\n", cle);                    \

> > +    }                                                        \

> > +} while(0)

> 

> This appears multiple times here and also in other filters.  Maybe it should be a

> macro in opencl.h like CL_SET_KERNEL_ARG?

Good idea.
> 

> > +

> > +static av_cold void nlmeans_opencl_uninit(AVFilterContext *avctx)

> > +{

> > +    NLMeansOpenCLContext *ctx = avctx->priv;

> > +    cl_int cle;

> > +

> > +    RELEASE_KERNEL(ctx->vert_kernel);

> > +    RELEASE_KERNEL(ctx->horiz_kernel);

> > +    RELEASE_KERNEL(ctx->accum_kernel);

> > +    RELEASE_KERNEL(ctx->average_kernel);

> > +

> > +    if (ctx->command_queue) {

> > +        cle = clReleaseCommandQueue(ctx->command_queue);

> > +        if (cle != CL_SUCCESS)

> > +            av_log(avctx, AV_LOG_ERROR, "Failed to release "

> > +                   "command queue: %d.\n", cle);

> > +    }

> > +

> > +    ff_opencl_filter_uninit(avctx);

> > +}

> > +

> > +#define OFFSET(x) offsetof(NLMeansOpenCLContext, x)

> > +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM |

> AV_OPT_FLAG_VIDEO_PARAM)

> > +static const AVOption nlmeans_opencl_options[] = {

> > +    { "s",  "denoising strength", OFFSET(sigma), AV_OPT_TYPE_DOUBLE, { .dbl

> = 1.0 }, 1.0, 30.0, FLAGS },

> > +    { "p",  "patch size",                   OFFSET(patch_size),    AV_OPT_TYPE_INT,

> { .i64 = 2*3+1 }, 0, 99, FLAGS },

> > +    { "pc", "patch size for chroma planes", OFFSET(patch_size_uv),

> AV_OPT_TYPE_INT, { .i64 = 0 },     0, 99, FLAGS },

> > +    { "r",  "research window",                   OFFSET(research_size),

> AV_OPT_TYPE_INT, { .i64 = 7*2+1 }, 0, 99, FLAGS },

> > +    { "rc", "research window for chroma planes", OFFSET(research_size_uv),

> AV_OPT_TYPE_INT, { .i64 = 0 },     0, 99, FLAGS },

> 

> There is an assumption that the size arguments are odd integers, but it isn't

> checked anywhere.

Will add the check.
> 

> > +    { NULL }

> > +};

> > +

> > +AVFILTER_DEFINE_CLASS(nlmeans_opencl);

> > +

> > +static const AVFilterPad nlmeans_opencl_inputs[] = {

> > +    {

> > +        .name         = "default",

> > +        .type         = AVMEDIA_TYPE_VIDEO,

> > +        .filter_frame = &nlmeans_opencl_filter_frame,

> > +        .config_props = &ff_opencl_filter_config_input,

> > +    },

> > +    { NULL }

> > +};

> > +

> > +static const AVFilterPad nlmeans_opencl_outputs[] = {

> > +    {

> > +        .name         = "default",

> > +        .type         = AVMEDIA_TYPE_VIDEO,

> > +        .config_props = &ff_opencl_filter_config_output,

> > +    },

> > +    { NULL }

> > +};

> > +

> > +AVFilter ff_vf_nlmeans_opencl = {

> > +    .name           = "nlmeans_opencl",

> > +    .description    = NULL_IF_CONFIG_SMALL("Non-local means denoiser

> through OpenCL"),

> > +    .priv_size      = sizeof(NLMeansOpenCLContext),

> > +    .priv_class     = &nlmeans_opencl_class,

> > +    .init           = &ff_opencl_filter_init,

> > +    .uninit         = &nlmeans_opencl_uninit,

> > +    .query_formats  = &ff_opencl_filter_query_formats,

> > +    .inputs         = nlmeans_opencl_inputs,

> > +    .outputs        = nlmeans_opencl_outputs,

> > +    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,

> > +};

> >

> 

> I got good results compared to CPU nlmeans (1.4x speedup) on the GPU vs. CPU

> of a CFL 8700 with Beignet, even including the upload/download overhead.

> Have you tried it on any larger GPU?  Can it get much faster with more

> processing power, or does something else limit it?

Generally the complexity is O(N^2 * R^2). N is image size, and R is the research window size.
You may notice that you can easily get higher performance with smaller R. but I am not good at tuning it.
I currently only have a my i7-6770HQ at hand, which has a GT4e GPU. it takes 500ms to process one 1080p frame using default parameter.
So basically I think you either need some powerful GPU or tune the research-window.

Thanks!
Ruiling
> 

> Thanks,

> 

> - Mark

> _______________________________________________

> ffmpeg-devel mailing list

> ffmpeg-devel@ffmpeg.org

> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

> 

> To unsubscribe, visit link above, or email

> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Carl Eugen Hoyos April 9, 2019, 1:20 p.m. UTC | #8
2019-04-09 4:54 GMT+02:00, Song, Ruiling <ruiling.song@intel.com>:

>> > +kernel void vert_sum(__global uint4 *ii,
>> > +                     int width,
>> > +                     int height)
>> > +{
>> > +    int x = get_global_id(0);
>> > +    uint4 sum = 0;
>> > +    for (int i = 0; i < height; i++) {
>> > +        ii[i * width + x] += sum;
>> > +        sum = ii[i * width + x];
>>
>> This looks like it might be able to overflow in extreme cases?
>>
>> 3840 * 2160 * (1 - 0)^2 * 255 * 255 = 539,343,360,000 which
>> is a long way out of range for a 32-bit int.  That requires
>> impossible input (all pixels differing by the most extreme
>> value), but something like a chequerboard might be of the
>> same order?
> Yes this is a dilemma for me. Generally the filter is with
> high computation cost.
> To fix the overflow, we have to use 64bit integer for the
> integral image. Most GPUs are not good at 64bit integer
> calculation I think. May be we can try later.
> So I would prefer to stay with 32bit integer for a while.

Can the overflow be detected at runtime?

Could the user choose between 32 and 64 bit calculation?

Carl Eugen
Ruiling Song April 10, 2019, 8:50 a.m. UTC | #9
> -----Original Message-----

> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of

> Carl Eugen Hoyos

> Sent: Tuesday, April 9, 2019 9:21 PM

> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>

> Subject: Re: [FFmpeg-devel] [PATCH] lavfi: add nlmeans_opencl filter

> 

> 2019-04-09 4:54 GMT+02:00, Song, Ruiling <ruiling.song@intel.com>:

> 

> >> > +kernel void vert_sum(__global uint4 *ii,

> >> > +                     int width,

> >> > +                     int height)

> >> > +{

> >> > +    int x = get_global_id(0);

> >> > +    uint4 sum = 0;

> >> > +    for (int i = 0; i < height; i++) {

> >> > +        ii[i * width + x] += sum;

> >> > +        sum = ii[i * width + x];

> >>

> >> This looks like it might be able to overflow in extreme cases?

> >>

> >> 3840 * 2160 * (1 - 0)^2 * 255 * 255 = 539,343,360,000 which

> >> is a long way out of range for a 32-bit int.  That requires

> >> impossible input (all pixels differing by the most extreme

> >> value), but something like a chequerboard might be of the

> >> same order?

> > Yes this is a dilemma for me. Generally the filter is with

> > high computation cost.

> > To fix the overflow, we have to use 64bit integer for the

> > integral image. Most GPUs are not good at 64bit integer

> > calculation I think. May be we can try later.

> > So I would prefer to stay with 32bit integer for a while.

> 

> Can the overflow be detected at runtime?

Will add the check.
> 

> Could the user choose between 32 and 64 bit calculation?

I may mark this as TODO.
> 

> Carl Eugen

> _______________________________________________

> ffmpeg-devel mailing list

> ffmpeg-devel@ffmpeg.org

> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

> 

> To unsubscribe, visit link above, or email

> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Ruiling Song April 12, 2019, 7:38 a.m. UTC | #10
> > > +#define RELEASE_KERNEL(k)                                    \

> > > +do {                                                         \

> > > +    if (k) {                                                 \

> > > +        cle = clReleaseKernel(k);                            \

> > > +        if (cle != CL_SUCCESS)                               \

> > > +            av_log(avctx, AV_LOG_ERROR, "Failed to release " \

> > > +                   "kernel: %d.\n", cle);                    \

> > > +    }                                                        \

> > > +} while(0)

> >

> > This appears multiple times here and also in other filters.  Maybe it should be a

> > macro in opencl.h like CL_SET_KERNEL_ARG?

Hi Mark,

I am rethinking about this problem, can we just simply call clReleaseKernel() and not checking the input and the error_code.
OpenCL spec has require implementation to check the input argument. So I think we can just ignore the if-null check.
As we are destroying the objects, is it still useful to care the error code returned?

Thanks!
Ruiling
Mark Thompson April 13, 2019, 5:22 p.m. UTC | #11
On 12/04/2019 08:38, Song, Ruiling wrote:
>>>> +#define RELEASE_KERNEL(k)                                    \
>>>> +do {                                                         \
>>>> +    if (k) {                                                 \
>>>> +        cle = clReleaseKernel(k);                            \
>>>> +        if (cle != CL_SUCCESS)                               \
>>>> +            av_log(avctx, AV_LOG_ERROR, "Failed to release " \
>>>> +                   "kernel: %d.\n", cle);                    \
>>>> +    }                                                        \
>>>> +} while(0)
>>>
>>> This appears multiple times here and also in other filters.  Maybe it should be a
>>> macro in opencl.h like CL_SET_KERNEL_ARG?
> Hi Mark,
> 
> I am rethinking about this problem, can we just simply call clReleaseKernel() and not checking the input and the error_code.
> OpenCL spec has require implementation to check the input argument. So I think we can just ignore the if-null check.

I'm not sure that's true?  The spec allows a CL_INVALID_KERNEL error, but doesn't offer any clear indication of when it should be returned (NULL is distinguished in other cases, but not here).  Random pointers certainly do crash implementations, so they aren't interpreting it as a requirement to validate the pointer generally (against some list in the context, say).

The standard ICD loader does have a null check returning CL_INVALID_KERNEL, but there is no requirement that it is used rather than linking to a particular ICD directly.

> As we are destroying the objects, is it still useful to care the error code returned?

Probably not, I agree.

- Mark
Ruiling Song April 14, 2019, 11:32 a.m. UTC | #12
> -----Original Message-----

> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of

> Mark Thompson

> Sent: Sunday, April 14, 2019 1:23 AM

> To: ffmpeg-devel@ffmpeg.org

> Subject: Re: [FFmpeg-devel] [PATCH] lavfi: add nlmeans_opencl filter

> 

> On 12/04/2019 08:38, Song, Ruiling wrote:

> >>>> +#define RELEASE_KERNEL(k)                                    \

> >>>> +do {                                                         \

> >>>> +    if (k) {                                                 \

> >>>> +        cle = clReleaseKernel(k);                            \

> >>>> +        if (cle != CL_SUCCESS)                               \

> >>>> +            av_log(avctx, AV_LOG_ERROR, "Failed to release " \

> >>>> +                   "kernel: %d.\n", cle);                    \

> >>>> +    }                                                        \

> >>>> +} while(0)

> >>>

> >>> This appears multiple times here and also in other filters.  Maybe it should

> be a

> >>> macro in opencl.h like CL_SET_KERNEL_ARG?

> > Hi Mark,

> >

> > I am rethinking about this problem, can we just simply call clReleaseKernel()

> and not checking the input and the error_code.

> > OpenCL spec has require implementation to check the input argument. So I

> think we can just ignore the if-null check.

> 

> I'm not sure that's true?  The spec allows a CL_INVALID_KERNEL error, but

> doesn't offer any clear indication of when it should be returned (NULL is

> distinguished in other cases, but not here).  Random pointers certainly do crash

> implementations, so they aren't interpreting it as a requirement to validate the

> pointer generally (against some list in the context, say).

Yes, seems the spec does not say about null pointer check clearly.
Because the null pointer check is cheap, so I thought every good programmed OpenCL driver should be able to check that.
Maybe you are right. I am not quite sure now:(
So we can keep the check as before. I have added the macro to do this. Please help take a look at V2 when you have time.

Thanks!
Ruiling
> 

> The standard ICD loader does have a null check returning CL_INVALID_KERNEL,

> but there is no requirement that it is used rather than linking to a particular ICD

> directly.

> 

> > As we are destroying the objects, is it still useful to care the error code

> returned?

> 

> Probably not, I agree.

> 

> - Mark

> _______________________________________________

> ffmpeg-devel mailing list

> ffmpeg-devel@ffmpeg.org

> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

> 

> To unsubscribe, visit link above, or email

> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
diff mbox

Patch

diff --git a/configure b/configure
index f6123f53e5..a233512491 100755
--- a/configure
+++ b/configure
@@ -3460,6 +3460,7 @@  mpdecimate_filter_select="pixelutils"
 minterpolate_filter_select="scene_sad"
 mptestsrc_filter_deps="gpl"
 negate_filter_deps="lut_filter"
+nlmeans_opencl_filter_deps="opencl"
 nnedi_filter_deps="gpl"
 ocr_filter_deps="libtesseract"
 ocv_filter_deps="libopencv"
diff --git a/doc/filters.texi b/doc/filters.texi
index 867607d870..21c2c1a4b5 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -19030,6 +19030,10 @@  Apply erosion filter with threshold0 set to 30, threshold1 set 40, threshold2 se
 @end example
 @end itemize
 
+@section nlmeans_opencl
+
+Non-local Means denoise filter through OpenCL, this filter accepts same options as @ref{nlmeans}.
+
 @section overlay_opencl
 
 Overlay one video on top of another.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index fef6ec5c55..92039bfdcf 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -291,6 +291,7 @@  OBJS-$(CONFIG_MIX_FILTER)                    += vf_mix.o
 OBJS-$(CONFIG_MPDECIMATE_FILTER)             += vf_mpdecimate.o
 OBJS-$(CONFIG_NEGATE_FILTER)                 += vf_lut.o
 OBJS-$(CONFIG_NLMEANS_FILTER)                += vf_nlmeans.o
+OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER)         += vf_nlmeans_opencl.o opencl.o opencl/nlmeans.o
 OBJS-$(CONFIG_NNEDI_FILTER)                  += vf_nnedi.o
 OBJS-$(CONFIG_NOFORMAT_FILTER)               += vf_format.o
 OBJS-$(CONFIG_NOISE_FILTER)                  += vf_noise.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index c51ae0f3c7..2a6390c92d 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -277,6 +277,7 @@  extern AVFilter ff_vf_mix;
 extern AVFilter ff_vf_mpdecimate;
 extern AVFilter ff_vf_negate;
 extern AVFilter ff_vf_nlmeans;
+extern AVFilter ff_vf_nlmeans_opencl;
 extern AVFilter ff_vf_nnedi;
 extern AVFilter ff_vf_noformat;
 extern AVFilter ff_vf_noise;
diff --git a/libavfilter/opencl/nlmeans.cl b/libavfilter/opencl/nlmeans.cl
new file mode 100644
index 0000000000..dcb04834ca
--- /dev/null
+++ b/libavfilter/opencl/nlmeans.cl
@@ -0,0 +1,108 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
+                           CLK_ADDRESS_CLAMP_TO_EDGE   |
+                           CLK_FILTER_NEAREST);
+
+kernel void horiz_sum(__global uint4 *ii,
+                      __read_only image2d_t src,
+                      int width,
+                      int height,
+                      int4 dx,
+                      int4 dy)
+{
+
+    int y = get_global_id(0);
+    int work_size = get_global_size(0);
+
+    uint4 sum = (uint4)(0);
+    float4 s2;
+    for (int i = 0; i < width; i++) {
+        float s1 = read_imagef(src, sampler, (int2)(i, y)).x;
+        s2.x = read_imagef(src, sampler, (int2)(i+dx.x, y+dy.x)).x;
+        s2.y = read_imagef(src, sampler, (int2)(i+dx.y, y+dy.y)).x;
+        s2.z = read_imagef(src, sampler, (int2)(i+dx.z, y+dy.z)).x;
+        s2.w = read_imagef(src, sampler, (int2)(i+dx.w, y+dy.w)).x;
+        sum += convert_uint4((s1-s2)*(s1-s2) * 255*255);
+        ii[y * width + i] = sum;
+    }
+}
+
+kernel void vert_sum(__global uint4 *ii,
+                     int width,
+                     int height)
+{
+    int x = get_global_id(0);
+    uint4 sum = 0;
+    for (int i = 0; i < height; i++) {
+        ii[i * width + x] += sum;
+        sum = ii[i * width + x];
+    }
+}
+
+kernel void weight_accum(global float *sum, global float *weight,
+                         global uint4 *ii, __read_only image2d_t src,
+                         int width, int height, int p, float h,
+                         int4 dx, int4 dy)
+{
+    // w(x) = ii(x-p, y-p) + ii(x+p, y+p) - ii(x+p, y-p) - ii(x-p, y+p)
+    // total_sum[x] += w(x, y) * src(x+dx, y+dy)
+    // total_weight += w(x, y)
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    int4 xoff = x+dx;
+    int4 yoff = y+dy;
+    uint4 a = 0, b = 0, c = 0, d = 0;
+    uint4 src_pix = 0;
+
+    // out-of-bounding-box?
+    int oobb = (x-p) < 0 || (y-p) < 0 || (y+p) >= height || (x+p) >= width;
+
+    src_pix.x = (int)(255 * read_imagef(src, sampler, (int2)(xoff.x, yoff.x)).x);
+    src_pix.y = (int)(255 * read_imagef(src, sampler, (int2)(xoff.y, yoff.y)).x);
+    src_pix.z = (int)(255 * read_imagef(src, sampler, (int2)(xoff.z, yoff.z)).x);
+    src_pix.w = (int)(255 * read_imagef(src, sampler, (int2)(xoff.w, yoff.w)).x);
+    if (!oobb) {
+        a = ii[(y-p) * width + x - p];
+        b = ii[(y + p) * width + x - p];
+        c = ii[(y-p) * width + x + p];
+        d = ii[(y + p) * width + x + p];
+    }
+
+    float4 patch_diff = convert_float4(d + a - c - b);
+    float4 w = native_exp(-patch_diff/(h*h));
+    float w_sum = w.x + w.y + w.z + w.w;
+    weight[y*width + x] += w_sum;
+    sum[y*width + x] += dot(w, convert_float4(src_pix));
+}
+
+kernel void average(__write_only image2d_t dst,
+                    __read_only image2d_t src,
+                    global float *sum, global float *weight) {
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    int2 dim = get_image_dim(dst);
+
+    float w = weight[y * dim.x + x];
+    float s = sum[y*dim.x + x];
+    float src_pix = read_imagef(src, sampler, (int2)(x, y)).x;
+    float r = (s + src_pix * 255) / (1.0f + w) / 255.0f;
+    if (x < dim.x && y < dim.y)
+        write_imagef(dst, (int2)(x, y), (float4)(r, 0.0f, 0.0f, 1.0f));
+}
diff --git a/libavfilter/opencl_source.h b/libavfilter/opencl_source.h
index 4118138c30..fd40fd7dca 100644
--- a/libavfilter/opencl_source.h
+++ b/libavfilter/opencl_source.h
@@ -23,6 +23,7 @@  extern const char *ff_opencl_source_avgblur;
 extern const char *ff_opencl_source_colorspace_common;
 extern const char *ff_opencl_source_convolution;
 extern const char *ff_opencl_source_neighbor;
+extern const char *ff_opencl_source_nlmeans;
 extern const char *ff_opencl_source_overlay;
 extern const char *ff_opencl_source_tonemap;
 extern const char *ff_opencl_source_transpose;
diff --git a/libavfilter/vf_nlmeans_opencl.c b/libavfilter/vf_nlmeans_opencl.c
new file mode 100644
index 0000000000..0cc9af10da
--- /dev/null
+++ b/libavfilter/vf_nlmeans_opencl.c
@@ -0,0 +1,390 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include <float.h>
+
+#include "libavutil/avassert.h"
+#include "libavutil/common.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/mem.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "avfilter.h"
+#include "internal.h"
+#include "opencl.h"
+#include "opencl_source.h"
+#include "video.h"
+
+static const enum AVPixelFormat supported_formats[] = {
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_GBRP,
+};
+
+static int is_format_supported(enum AVPixelFormat fmt)
+{
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
+        if (supported_formats[i] == fmt)
+            return 1;
+    return 0;
+}
+
+typedef struct NLMeansOpenCLContext {
+    OpenCLFilterContext   ocf;
+    int                   initialised;
+    cl_kernel             vert_kernel;
+    cl_kernel             horiz_kernel;
+    cl_kernel             accum_kernel;
+    cl_kernel             average_kernel;
+    double                sigma;
+    float                 h;
+    int                   chroma_w;
+    int                   chroma_h;
+    int                   patch_size;
+    int                   patch_size_uv;
+    int                   research_size;
+    int                   research_size_uv;
+    cl_command_queue      command_queue;
+} NLMeansOpenCLContext;
+
+static int nlmeans_opencl_init(AVFilterContext *avctx)
+{
+    NLMeansOpenCLContext *ctx = avctx->priv;
+    cl_int cle;
+    int err;
+
+    ctx->h = ctx->sigma * 10;
+    if (!ctx->research_size_uv)
+        ctx->research_size_uv = ctx->research_size;
+    if (!ctx->patch_size_uv)
+        ctx->patch_size_uv = ctx->patch_size;
+
+    err = ff_opencl_filter_load_program(avctx, &ff_opencl_source_nlmeans, 1);
+    if (err < 0)
+        goto fail;
+
+    ctx->command_queue = clCreateCommandQueue(ctx->ocf.hwctx->context,
+                                              ctx->ocf.hwctx->device_id,
+                                              0, &cle);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create OpenCL "
+                     "command queue %d.\n", cle);
+
+    ctx->vert_kernel = clCreateKernel(ctx->ocf.program, "vert_sum", &cle);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create vert_sum kernel %d.\n", cle);
+
+    ctx->horiz_kernel = clCreateKernel(ctx->ocf.program, "horiz_sum", &cle);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create horiz_sum kernel %d.\n", cle);
+
+    ctx->accum_kernel = clCreateKernel(ctx->ocf.program, "weight_accum", &cle);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create accum kernel %d.\n", cle);
+
+    ctx->average_kernel = clCreateKernel(ctx->ocf.program, "average", &cle);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create average kernel %d.\n", cle);
+
+    ctx->initialised = 1;
+    return 0;
+
+fail:
+    if (ctx->command_queue)
+        clReleaseCommandQueue(ctx->command_queue);
+    if (ctx->vert_kernel)
+        clReleaseKernel(ctx->vert_kernel);
+    if (ctx->horiz_kernel)
+        clReleaseKernel(ctx->horiz_kernel);
+    if (ctx->accum_kernel)
+        clReleaseKernel(ctx->accum_kernel);
+    if (ctx->average_kernel)
+        clReleaseKernel(ctx->average_kernel);
+    return err;
+}
+
+static int nlmeans_plane(AVFilterContext *avctx, cl_mem dst, cl_mem src,
+                         int w, int h, int p, int r)
+{
+    NLMeansOpenCLContext *ctx = avctx->priv;
+    const float zero = 0.0f;
+    const size_t worksize1[] = {h};
+    const size_t worksize2[] = {w};
+    const size_t worksize3[2] = {w, h};
+    int dx, dy, err = 0, weight_buf_size;
+    cl_mem ii, weight, sum;
+    cl_int cle;
+    int nb_pixel, *tmp, *dxdy, idx = 0;
+
+    weight_buf_size = w * h * sizeof(int);
+    ii = clCreateBuffer(ctx->ocf.hwctx->context, 0, 4 * weight_buf_size,
+                               NULL, &cle);
+    weight = clCreateBuffer(ctx->ocf.hwctx->context, 0, weight_buf_size,
+                               NULL, &cle);
+    sum = clCreateBuffer(ctx->ocf.hwctx->context, 0, weight_buf_size,
+                               NULL, &cle);
+    cle = clEnqueueFillBuffer(ctx->command_queue, weight, &zero, sizeof(float),
+                              0, weight_buf_size, 0, NULL, NULL);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to fill weight buffer: %d.\n",
+                     cle);
+    cle = clEnqueueFillBuffer(ctx->command_queue, sum, &zero, sizeof(float),
+                              0, weight_buf_size, 0, NULL, NULL);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to fill sum buffer: %d.\n",
+                     cle);
+
+    nb_pixel = (2*r+1) *(2*r+1)-1;
+    dxdy = av_malloc(nb_pixel * 2 * sizeof(int));
+    tmp = av_malloc(nb_pixel * 2 * sizeof(int));
+
+    if (!dxdy || !tmp)
+        goto fail;
+
+    for (dx = -r; dx <= r; dx++) {
+        for (dy = -r; dy <= r; dy++) {
+            if (dx || dy) {
+                tmp[idx++] = dx;
+                tmp[idx++] = dy;
+            }
+        }
+    }
+    // repack dx/dy seperately, as we want to do four pairs of dx/dy in a batch
+    for (int i = 0; i < nb_pixel/4;i++) {
+        dxdy[i * 8] = tmp[i * 8];         // dx0
+        dxdy[i * 8 + 1] = tmp[i * 8 + 2]; // dx1
+        dxdy[i * 8 + 2] = tmp[i * 8 + 4]; // dx2
+        dxdy[i * 8 + 3] = tmp[i * 8 + 6]; // dx3
+        dxdy[i * 8 + 4] = tmp[i * 8 + 1]; // dy0
+        dxdy[i * 8 + 5] = tmp[i * 8 + 3]; // dy1
+        dxdy[i * 8 + 6] = tmp[i * 8 + 5]; // dy2
+        dxdy[i * 8 + 7] = tmp[i * 8 + 7]; // dy3
+    }
+    av_freep(&tmp);
+
+    for (int i = 0; i < nb_pixel / 4; i++) {
+        int *dx_cur = dxdy + 8 * i;
+        int *dy_cur = dxdy + 8 * i + 4;
+
+        // ii(x,y) = sum of [u(i,y) - u(i+dx,y+dy)]^2 for all i < x
+        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 0, cl_mem, &ii);
+        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 1, cl_mem, &src);
+        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 2, cl_int, &w);
+        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 3, cl_int, &h);
+        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 4, cl_int4, dx_cur);
+        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 5, cl_int4, dy_cur);
+        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->horiz_kernel, 1,
+                               NULL, worksize1, NULL, 0, NULL, NULL);
+        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue horiz_kernel: %d.\n",
+                         cle);
+
+        // ii(x,y) = ii(x,0) + ii(x,1) +...+ ii(x,y-1)
+        CL_SET_KERNEL_ARG(ctx->vert_kernel, 0, cl_mem, &ii);
+        CL_SET_KERNEL_ARG(ctx->vert_kernel, 1, cl_int, &w);
+        CL_SET_KERNEL_ARG(ctx->vert_kernel, 2, cl_int, &h);
+        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->vert_kernel,
+                                     1, NULL, worksize2, NULL, 0, NULL, NULL);
+        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue vert_kernel: %d.\n",
+                         cle);
+
+        // accumlate weights
+        CL_SET_KERNEL_ARG(ctx->accum_kernel, 0, cl_mem, &sum);
+        CL_SET_KERNEL_ARG(ctx->accum_kernel, 1, cl_mem, &weight);
+        CL_SET_KERNEL_ARG(ctx->accum_kernel, 2, cl_mem, &ii);
+        CL_SET_KERNEL_ARG(ctx->accum_kernel, 3, cl_mem, &src);
+        CL_SET_KERNEL_ARG(ctx->accum_kernel, 4, cl_int, &w);
+        CL_SET_KERNEL_ARG(ctx->accum_kernel, 5, cl_int, &h);
+        CL_SET_KERNEL_ARG(ctx->accum_kernel, 6, cl_int, &p);
+        CL_SET_KERNEL_ARG(ctx->accum_kernel, 7, cl_float, &ctx->h);
+        CL_SET_KERNEL_ARG(ctx->accum_kernel, 8, cl_int4, dx_cur);
+        CL_SET_KERNEL_ARG(ctx->accum_kernel, 9, cl_int4, dy_cur);
+        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->accum_kernel,
+                                     2, NULL, worksize3, NULL, 0, NULL, NULL);
+        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue kernel: %d.\n", cle);
+    }
+    av_freep(&dxdy);
+
+    // average
+    CL_SET_KERNEL_ARG(ctx->average_kernel, 0, cl_mem, &dst);
+    CL_SET_KERNEL_ARG(ctx->average_kernel, 1, cl_mem, &src);
+    CL_SET_KERNEL_ARG(ctx->average_kernel, 2, cl_mem, &sum);
+    CL_SET_KERNEL_ARG(ctx->average_kernel, 3, cl_mem, &weight);
+    cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->average_kernel, 2,
+                                 NULL, worksize3, NULL, 0, NULL, NULL);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue average kernel: %d.\n",
+                     cle);
+    cle = clFinish(ctx->command_queue);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to finish kernel: %d.\n", cle);
+fail:
+    if (tmp)
+      av_freep(&tmp);
+    if (dxdy)
+      av_freep(&dxdy);
+    clFinish(ctx->command_queue);
+    clReleaseMemObject(ii);
+    clReleaseMemObject(weight);
+    clReleaseMemObject(sum);
+    return err;
+}
+
+static int nlmeans_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
+{
+    AVFilterContext    *avctx = inlink->dst;
+    AVFilterLink     *outlink = avctx->outputs[0];
+    NLMeansOpenCLContext *ctx = avctx->priv;
+    AVFrame *output = NULL;
+    AVHWFramesContext *input_frames_ctx;
+    const AVPixFmtDescriptor *desc;
+    enum AVPixelFormat in_format;
+    cl_mem src, dst;
+    int w, h, err, p, patch, research;
+
+    av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",
+           av_get_pix_fmt_name(input->format),
+           input->width, input->height, input->pts);
+
+    if (!input->hw_frames_ctx)
+        return AVERROR(EINVAL);
+    input_frames_ctx = (AVHWFramesContext*)input->hw_frames_ctx->data;
+    in_format = input_frames_ctx->sw_format;
+
+    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+    if (!output) {
+        err = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    err = av_frame_copy_props(output, input);
+    if (err < 0)
+        goto fail;
+
+    if (!ctx->initialised) {
+        desc = av_pix_fmt_desc_get(in_format);
+        if (!is_format_supported(in_format)) {
+            err = AVERROR(EINVAL);
+            av_log(avctx, AV_LOG_ERROR, "input format %s not supported\n",
+                   av_get_pix_fmt_name(in_format));
+            goto fail;
+        }
+        ctx->chroma_w = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);
+        ctx->chroma_h = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
+
+        err = nlmeans_opencl_init(avctx);
+        if (err < 0)
+            goto fail;
+    }
+
+    for (p = 0; p < FF_ARRAY_ELEMS(output->data); p++) {
+        src = (cl_mem) input->data[p];
+        dst = (cl_mem) output->data[p];
+
+        if (!dst)
+            break;
+        w = p ? ctx->chroma_w : inlink->w;
+        h = p ? ctx->chroma_h : inlink->h;
+        patch = (p ? ctx->patch_size_uv : ctx->patch_size) / 2;
+        research = (p ? ctx->research_size_uv : ctx->research_size) / 2;
+        err = nlmeans_plane(avctx, dst, src, w, h, patch, research);
+        if (err < 0)
+            goto fail;
+    }
+
+    av_frame_free(&input);
+
+    av_log(ctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n",
+           av_get_pix_fmt_name(output->format),
+           output->width, output->height, output->pts);
+
+    return ff_filter_frame(outlink, output);
+
+fail:
+    clFinish(ctx->command_queue);
+    av_frame_free(&input);
+    av_frame_free(&output);
+    return err;
+}
+
+#define RELEASE_KERNEL(k)                                    \
+do {                                                         \
+    if (k) {                                                 \
+        cle = clReleaseKernel(k);                            \
+        if (cle != CL_SUCCESS)                               \
+            av_log(avctx, AV_LOG_ERROR, "Failed to release " \
+                   "kernel: %d.\n", cle);                    \
+    }                                                        \
+} while(0)
+
+static av_cold void nlmeans_opencl_uninit(AVFilterContext *avctx)
+{
+    NLMeansOpenCLContext *ctx = avctx->priv;
+    cl_int cle;
+
+    RELEASE_KERNEL(ctx->vert_kernel);
+    RELEASE_KERNEL(ctx->horiz_kernel);
+    RELEASE_KERNEL(ctx->accum_kernel);
+    RELEASE_KERNEL(ctx->average_kernel);
+
+    if (ctx->command_queue) {
+        cle = clReleaseCommandQueue(ctx->command_queue);
+        if (cle != CL_SUCCESS)
+            av_log(avctx, AV_LOG_ERROR, "Failed to release "
+                   "command queue: %d.\n", cle);
+    }
+
+    ff_opencl_filter_uninit(avctx);
+}
+
+#define OFFSET(x) offsetof(NLMeansOpenCLContext, x)
+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
+static const AVOption nlmeans_opencl_options[] = {
+    { "s",  "denoising strength", OFFSET(sigma), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 30.0, FLAGS },
+    { "p",  "patch size",                   OFFSET(patch_size),    AV_OPT_TYPE_INT, { .i64 = 2*3+1 }, 0, 99, FLAGS },
+    { "pc", "patch size for chroma planes", OFFSET(patch_size_uv), AV_OPT_TYPE_INT, { .i64 = 0 },     0, 99, FLAGS },
+    { "r",  "research window",                   OFFSET(research_size),    AV_OPT_TYPE_INT, { .i64 = 7*2+1 }, 0, 99, FLAGS },
+    { "rc", "research window for chroma planes", OFFSET(research_size_uv), AV_OPT_TYPE_INT, { .i64 = 0 },     0, 99, FLAGS },
+    { NULL }
+};
+
+AVFILTER_DEFINE_CLASS(nlmeans_opencl);
+
+static const AVFilterPad nlmeans_opencl_inputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .filter_frame = &nlmeans_opencl_filter_frame,
+        .config_props = &ff_opencl_filter_config_input,
+    },
+    { NULL }
+};
+
+static const AVFilterPad nlmeans_opencl_outputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = &ff_opencl_filter_config_output,
+    },
+    { NULL }
+};
+
+AVFilter ff_vf_nlmeans_opencl = {
+    .name           = "nlmeans_opencl",
+    .description    = NULL_IF_CONFIG_SMALL("Non-local means denoiser through OpenCL"),
+    .priv_size      = sizeof(NLMeansOpenCLContext),
+    .priv_class     = &nlmeans_opencl_class,
+    .init           = &ff_opencl_filter_init,
+    .uninit         = &nlmeans_opencl_uninit,
+    .query_formats  = &ff_opencl_filter_query_formats,
+    .inputs         = nlmeans_opencl_inputs,
+    .outputs        = nlmeans_opencl_outputs,
+    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};