diff mbox series

[FFmpeg-devel] avfilter/vf_nlmeans_opencl: making filter independent of bit depth

Message ID 20210321232223.210494-1-lvella@gmail.com
State New
Headers show
Series [FFmpeg-devel] avfilter/vf_nlmeans_opencl: making filter independent of bit depth | expand

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Lucas Clemente Vella March 21, 2021, 11:22 p.m. UTC
This filter originally quantized OpenCL float images fetchs in 256 levels,
and computed the integral image of squared differences in 32 bit integers.

This had two consequences:
1) it could overflow if the image resolution was big enough (I got overflows
in a 4K video);
2) it dropped precision from bit depths higher than 8 bits.

Now the integral image is computed with float values in range [0, 1], instead
of integers in range [0, 255] (then squared), so there is no longer the risk
of overflow.

And even with the accumulated floating point error over the integral image,
the resulting difference between this float implementation and an experimental
uint64 implementation with 65535 quantization levels is less than 0.08%
on the worst difference (per component), and less than 0.002% on average. For
reference, the smallest variation possible on a 10-bit quantization is 0.098%
of the total intensity. This was tested on a 4K frame from an 10-bit source.

Signed-off-by: Lucas Clemente Vella <lvella@gmail.com>
---
 libavfilter/opencl/nlmeans.cl   | 31 ++++++++++++++----------------
 libavfilter/vf_nlmeans_opencl.c | 34 +++++----------------------------
 2 files changed, 19 insertions(+), 46 deletions(-)

Comments

Lucas Clemente Vella March 24, 2021, 6:51 p.m. UTC | #1
Hello.

Is there something I should be doing in order to get this patch
reviewed for inclusion in master? I couldn't find the proper
maintainer for this filter in the MAINTAINERS file.
Mark Thompson April 3, 2021, 5:31 p.m. UTC | #2
On 21/03/2021 23:22, Lucas Clemente Vella wrote:
> This filter originally quantized OpenCL float images fetchs in 256 levels,
> and computed the integral image of squared differences in 32 bit integers.
> 
> This had two consequences:
> 1) it could overflow if the image resolution was big enough (I got overflows
> in a 4K video);
> 2) it dropped precision from bit depths higher than 8 bits.
> 
> Now the integral image is computed with float values in range [0, 1], instead
> of integers in range [0, 255] (then squared), so there is no longer the risk
> of overflow.
> 
> And even with the accumulated floating point error over the integral image,
> the resulting difference between this float implementation and an experimental
> uint64 implementation with 65535 quantization levels is less than 0.08%
> on the worst difference (per component), and less than 0.002% on average. For
> reference, the smallest variation possible on a 10-bit quantization is 0.098%
> of the total intensity. This was tested on a 4K frame from an 10-bit source.

Does the sum actually avoid floating-point error beyond simple rounding?

Given only 23 bits of precision, it looks like there is a danger of losing things by adding a set of small numbers to a large number and the large number being unchanged.

> 
> Signed-off-by: Lucas Clemente Vella <lvella@gmail.com>
> ---
>   libavfilter/opencl/nlmeans.cl   | 31 ++++++++++++++----------------
>   libavfilter/vf_nlmeans_opencl.c | 34 +++++----------------------------
>   2 files changed, 19 insertions(+), 46 deletions(-)
> 
> diff --git a/libavfilter/opencl/nlmeans.cl b/libavfilter/opencl/nlmeans.cl
> index 72bd681fd6..6d78a41e46 100644
> --- a/libavfilter/opencl/nlmeans.cl
> +++ b/libavfilter/opencl/nlmeans.cl
> @@ -20,7 +20,7 @@ const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
>                              CLK_ADDRESS_CLAMP_TO_EDGE   |
>                              CLK_FILTER_NEAREST);
>   
> -kernel void horiz_sum(__global uint4 *integral_img,
> +kernel void horiz_sum(__global float4 *integral_img,
>                         __read_only image2d_t src,
>                         int width,
>                         int height,
> @@ -31,7 +31,7 @@ kernel void horiz_sum(__global uint4 *integral_img,
>       int y = get_global_id(0);
>       int work_size = get_global_size(0);
>   
> -    uint4 sum = (uint4)(0);
> +    float4 sum = 0.0;

Surprise double.

>       float4 s2;
>       for (int i = 0; i < width; i++) {
>           float s1 = read_imagef(src, sampler, (int2)(i, y)).x;
> @@ -39,28 +39,25 @@ kernel void horiz_sum(__global uint4 *integral_img,
>           s2.y = read_imagef(src, sampler, (int2)(i + dx.y, y + dy.y)).x;
>           s2.z = read_imagef(src, sampler, (int2)(i + dx.z, y + dy.z)).x;
>           s2.w = read_imagef(src, sampler, (int2)(i + dx.w, y + dy.w)).x;
> -        sum += convert_uint4((s1 - s2) * (s1 - s2) * 255 * 255);
> +        sum += (s1 - s2) * (s1 - s2);
>           integral_img[y * width + i] = sum;
>       }
>   }
>   
> -kernel void vert_sum(__global uint4 *integral_img,
> -                     __global int *overflow,
> +kernel void vert_sum(__global float4 *integral_img,
>                        int width,
>                        int height)
>   {
>       int x = get_global_id(0);
> -    uint4 sum = 0;
> +    float4 sum = 0;

Please write floats as floats (0.0f), even if the implicit conversion is correct.

>       for (int i = 0; i < height; i++) {
> -        if (any((uint4)UINT_MAX - integral_img[i * width + x] < sum))
> -            atomic_inc(overflow);
>           integral_img[i * width + x] += sum;
>           sum = integral_img[i * width + x];
>       }
>   }
>   
>   kernel void weight_accum(global float *sum, global float *weight,
> -                         global uint4 *integral_img, __read_only image2d_t src,
> +                         global float4 *integral_img, __read_only image2d_t src,
>                            int width, int height, int p, float h,
>                            int4 dx, int4 dy)
>   {
> @@ -75,16 +72,16 @@ kernel void weight_accum(global float *sum, global float *weight,
>       int y = get_global_id(1);
>       int4 xoff = x + dx;
>       int4 yoff = y + dy;
> -    uint4 a = 0, b = 0, c = 0, d = 0;
> -    uint4 src_pix = 0;
> +    float4 a = 0, b = 0, c = 0, d = 0;
> +    float4 src_pix = 0;
>   
>       // out-of-bounding-box?
>       int oobb = (x - p) < 0 || (y - p) < 0 || (y + p) >= height || (x + p) >= width;
>   
> -    src_pix.x = (int)(255 * read_imagef(src, sampler, (int2)(xoff.x, yoff.x)).x);
> -    src_pix.y = (int)(255 * read_imagef(src, sampler, (int2)(xoff.y, yoff.y)).x);
> -    src_pix.z = (int)(255 * read_imagef(src, sampler, (int2)(xoff.z, yoff.z)).x);
> -    src_pix.w = (int)(255 * read_imagef(src, sampler, (int2)(xoff.w, yoff.w)).x);
> +    src_pix.x = read_imagef(src, sampler, (int2)(xoff.x, yoff.x)).x;
> +    src_pix.y = read_imagef(src, sampler, (int2)(xoff.y, yoff.y)).x;
> +    src_pix.z = read_imagef(src, sampler, (int2)(xoff.z, yoff.z)).x;
> +    src_pix.w = read_imagef(src, sampler, (int2)(xoff.w, yoff.w)).x;
>       if (!oobb) {
>           a = integral_img[(y - p) * width + x - p];
>           b = integral_img[(y + p) * width + x - p];
> @@ -93,7 +90,7 @@ kernel void weight_accum(global float *sum, global float *weight,
>       }
>   
>       float4 patch_diff = convert_float4(d + a - c - b);
> -    float4 w = native_exp(-patch_diff / (h * h));
> +    float4 w = native_exp(-patch_diff * (float4)(255.0f * 255.0f) / (h * h));
>       float w_sum = w.x + w.y + w.z + w.w;
>       weight[y * width + x] += w_sum;
>       sum[y * width + x] += dot(w, convert_float4(src_pix));
> @@ -109,7 +106,7 @@ kernel void average(__write_only image2d_t dst,
>       float w = weight[y * dim.x + x];
>       float s = sum[y * dim.x + x];
>       float src_pix = read_imagef(src, sampler, (int2)(x, y)).x;
> -    float r = (s + src_pix * 255) / (1.0f + w) / 255.0f;
> +    float r = (s + src_pix) / (1.0f + w);
>       if (x < dim.x && y < dim.y)
>           write_imagef(dst, (int2)(x, y), (float4)(r, 0.0f, 0.0f, 1.0f));
>   }
> diff --git a/libavfilter/vf_nlmeans_opencl.c b/libavfilter/vf_nlmeans_opencl.c
> index e57b5e0873..0b69f3b6c4 100644
> --- a/libavfilter/vf_nlmeans_opencl.c
> +++ b/libavfilter/vf_nlmeans_opencl.c
> @@ -30,11 +30,9 @@
>   #include "opencl_source.h"
>   #include "video.h"
>   
> -// TODO:
> -//      the integral image may overflow 32bit, consider using 64bit
> -
>   static const enum AVPixelFormat supported_formats[] = {
>       AV_PIX_FMT_YUV420P,
> +    AV_PIX_FMT_YUV420P16LE,
>       AV_PIX_FMT_YUV444P,

Is this the only format it has added support for?  Feels like it could do others which in the MSBs.

>       AV_PIX_FMT_GBRP,
>   };
> @@ -59,7 +57,6 @@ typedef struct NLMeansOpenCLContext {
>       cl_mem                integral_img;
>       cl_mem                weight;
>       cl_mem                sum;
> -    cl_mem                overflow; // overflow in integral image?
>       double                sigma;
>       float                 h;
>       int                   chroma_w;
> @@ -129,7 +126,7 @@ static int nlmeans_opencl_init(AVFilterContext *avctx, int width, int height)
>                        "average kernel %d.\n", cle);
>   
>       ctx->integral_img = clCreateBuffer(ctx->ocf.hwctx->context, 0,
> -                                       4 * width * height * sizeof(cl_int),
> +                                       4 * width * height * sizeof(cl_float),
>                                          NULL, &cle);
>       CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
>                        "integral image %d.\n", cle);
> @@ -144,11 +141,6 @@ static int nlmeans_opencl_init(AVFilterContext *avctx, int width, int height)
>       CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
>                        "sum buffer %d.\n", cle);
>   
> -    ctx->overflow = clCreateBuffer(ctx->ocf.hwctx->context, 0,
> -                                   sizeof(cl_int), NULL, &cle);
> -    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
> -                     "overflow buffer %d.\n", cle);
> -
>       ctx->initialised = 1;
>       return 0;
>   
> @@ -161,7 +153,6 @@ fail:
>       CL_RELEASE_MEMORY(ctx->integral_img);
>       CL_RELEASE_MEMORY(ctx->weight);
>       CL_RELEASE_MEMORY(ctx->sum);
> -    CL_RELEASE_MEMORY(ctx->overflow);
>   
>       CL_RELEASE_QUEUE(ctx->command_queue);
>       return err;
> @@ -239,9 +230,8 @@ static int nlmeans_plane(AVFilterContext *avctx, cl_mem dst, cl_mem src,
>           // vertical pass
>           // integral(x, y) = sum(integral(x, v)) for v in [0, y]
>           CL_SET_KERNEL_ARG(ctx->vert_kernel, 0, cl_mem, &ctx->integral_img);
> -        CL_SET_KERNEL_ARG(ctx->vert_kernel, 1, cl_mem, &ctx->overflow);
> -        CL_SET_KERNEL_ARG(ctx->vert_kernel, 2, cl_int, &width);
> -        CL_SET_KERNEL_ARG(ctx->vert_kernel, 3, cl_int, &height);
> +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 1, cl_int, &width);
> +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 2, cl_int, &height);

Not affecting this patch, but a side thought: are these arguments actually needed?  They are used as the work sizes, so might be accessible inside the kernel already.

>           cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->vert_kernel,
>                                        1, NULL, worksize2, NULL, 0, NULL, NULL);
>           CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue vert_kernel: %d.\n",
> @@ -293,8 +283,7 @@ static int nlmeans_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
>       const AVPixFmtDescriptor *desc;
>       enum AVPixelFormat in_format;
>       cl_mem src, dst;
> -    const cl_int zero = 0;
> -    int w, h, err, cle, overflow, p, patch, research;
> +    int w, h, err, cle, p, patch, research;
>   
>       av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",
>              av_get_pix_fmt_name(input->format),
> @@ -331,11 +320,6 @@ static int nlmeans_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
>               goto fail;
>       }
>   
> -    cle = clEnqueueWriteBuffer(ctx->command_queue, ctx->overflow, CL_FALSE,
> -                               0, sizeof(cl_int), &zero, 0, NULL, NULL);
> -    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to initialize overflow"
> -                     "detection buffer %d.\n", cle);
> -
>       for (p = 0; p < FF_ARRAY_ELEMS(output->data); p++) {
>           src = (cl_mem) input->data[p];
>           dst = (cl_mem) output->data[p];
> @@ -351,17 +335,10 @@ static int nlmeans_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
>           if (err < 0)
>               goto fail;
>       }
> -    // overflow occurred?
> -    cle = clEnqueueReadBuffer(ctx->command_queue, ctx->overflow, CL_FALSE,
> -                              0, sizeof(cl_int), &overflow, 0, NULL, NULL);
> -    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to read overflow: %d.\n", cle);
>   
>       cle = clFinish(ctx->command_queue);
>       CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to finish kernel: %d.\n", cle);
>   
> -    if (overflow > 0)
> -        av_log(avctx, AV_LOG_ERROR, "integral image overflow %d\n", overflow);
> -
>       av_frame_free(&input);
>   
>       av_log(ctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n",
> @@ -390,7 +367,6 @@ static av_cold void nlmeans_opencl_uninit(AVFilterContext *avctx)
>       CL_RELEASE_MEMORY(ctx->integral_img);
>       CL_RELEASE_MEMORY(ctx->weight);
>       CL_RELEASE_MEMORY(ctx->sum);
> -    CL_RELEASE_MEMORY(ctx->overflow);
>   
>       CL_RELEASE_QUEUE(ctx->command_queue);
>   
> 

Thanks,

- Mark
Lucas Clemente Vella April 3, 2021, 7:59 p.m. UTC | #3
Em sáb., 3 de abr. de 2021 às 19:26, Mark Thompson <sw@jkqxz.net> escreveu:
>
> On 21/03/2021 23:22, Lucas Clemente Vella wrote:
> > This filter originally quantized OpenCL float images fetchs in 256 levels,
> > and computed the integral image of squared differences in 32 bit integers.
> >
> > This had two consequences:
> > 1) it could overflow if the image resolution was big enough (I got overflows
> > in a 4K video);
> > 2) it dropped precision from bit depths higher than 8 bits.
> >
> > Now the integral image is computed with float values in range [0, 1], instead
> > of integers in range [0, 255] (then squared), so there is no longer the risk
> > of overflow.
> >
> > And even with the accumulated floating point error over the integral image,
> > the resulting difference between this float implementation and an experimental
> > uint64 implementation with 65535 quantization levels is less than 0.08%
> > on the worst difference (per component), and less than 0.002% on average. For
> > reference, the smallest variation possible on a 10-bit quantization is 0.098%
> > of the total intensity. This was tested on a 4K frame from an 10-bit source.
>
> Does the sum actually avoid floating-point error beyond simple rounding?
>
> Given only 23 bits of precision, it looks like there is a danger of losing things by adding a set of small numbers to a large number and the large number being unchanged.

I think it might, but I think the risk of affecting visibly the final
image is too small
given the performance benefit of this float4 version compared to the
ulong4 version
(my other patch). The image integral will still be a good enough approximation,
and the float4 version is twice as fast as the ulong4 version.

> >
> > Signed-off-by: Lucas Clemente Vella <lvella@gmail.com>
> > ---
> >   libavfilter/opencl/nlmeans.cl   | 31 ++++++++++++++----------------
> >   libavfilter/vf_nlmeans_opencl.c | 34 +++++----------------------------
> >   2 files changed, 19 insertions(+), 46 deletions(-)
> >
> > diff --git a/libavfilter/opencl/nlmeans.cl b/libavfilter/opencl/nlmeans.cl
> > index 72bd681fd6..6d78a41e46 100644
> > --- a/libavfilter/opencl/nlmeans.cl
> > +++ b/libavfilter/opencl/nlmeans.cl
> > @@ -20,7 +20,7 @@ const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
> >                              CLK_ADDRESS_CLAMP_TO_EDGE   |
> >                              CLK_FILTER_NEAREST);
> >
> > -kernel void horiz_sum(__global uint4 *integral_img,
> > +kernel void horiz_sum(__global float4 *integral_img,
> >                         __read_only image2d_t src,
> >                         int width,
> >                         int height,
> > @@ -31,7 +31,7 @@ kernel void horiz_sum(__global uint4 *integral_img,
> >       int y = get_global_id(0);
> >       int work_size = get_global_size(0);
> >
> > -    uint4 sum = (uint4)(0);
> > +    float4 sum = 0.0;
>
> Surprise double.
>
> >       float4 s2;
> >       for (int i = 0; i < width; i++) {
> >           float s1 = read_imagef(src, sampler, (int2)(i, y)).x;
> > @@ -39,28 +39,25 @@ kernel void horiz_sum(__global uint4 *integral_img,
> >           s2.y = read_imagef(src, sampler, (int2)(i + dx.y, y + dy.y)).x;
> >           s2.z = read_imagef(src, sampler, (int2)(i + dx.z, y + dy.z)).x;
> >           s2.w = read_imagef(src, sampler, (int2)(i + dx.w, y + dy.w)).x;
> > -        sum += convert_uint4((s1 - s2) * (s1 - s2) * 255 * 255);
> > +        sum += (s1 - s2) * (s1 - s2);
> >           integral_img[y * width + i] = sum;
> >       }
> >   }
> >
> > -kernel void vert_sum(__global uint4 *integral_img,
> > -                     __global int *overflow,
> > +kernel void vert_sum(__global float4 *integral_img,
> >                        int width,
> >                        int height)
> >   {
> >       int x = get_global_id(0);
> > -    uint4 sum = 0;
> > +    float4 sum = 0;
>
> Please write floats as floats (0.0f), even if the implicit conversion is correct.
>
> >       for (int i = 0; i < height; i++) {
> > -        if (any((uint4)UINT_MAX - integral_img[i * width + x] < sum))
> > -            atomic_inc(overflow);
> >           integral_img[i * width + x] += sum;
> >           sum = integral_img[i * width + x];
> >       }
> >   }
> >
> >   kernel void weight_accum(global float *sum, global float *weight,
> > -                         global uint4 *integral_img, __read_only image2d_t src,
> > +                         global float4 *integral_img, __read_only image2d_t src,
> >                            int width, int height, int p, float h,
> >                            int4 dx, int4 dy)
> >   {
> > @@ -75,16 +72,16 @@ kernel void weight_accum(global float *sum, global float *weight,
> >       int y = get_global_id(1);
> >       int4 xoff = x + dx;
> >       int4 yoff = y + dy;
> > -    uint4 a = 0, b = 0, c = 0, d = 0;
> > -    uint4 src_pix = 0;
> > +    float4 a = 0, b = 0, c = 0, d = 0;
> > +    float4 src_pix = 0;
> >
> >       // out-of-bounding-box?
> >       int oobb = (x - p) < 0 || (y - p) < 0 || (y + p) >= height || (x + p) >= width;
> >
> > -    src_pix.x = (int)(255 * read_imagef(src, sampler, (int2)(xoff.x, yoff.x)).x);
> > -    src_pix.y = (int)(255 * read_imagef(src, sampler, (int2)(xoff.y, yoff.y)).x);
> > -    src_pix.z = (int)(255 * read_imagef(src, sampler, (int2)(xoff.z, yoff.z)).x);
> > -    src_pix.w = (int)(255 * read_imagef(src, sampler, (int2)(xoff.w, yoff.w)).x);
> > +    src_pix.x = read_imagef(src, sampler, (int2)(xoff.x, yoff.x)).x;
> > +    src_pix.y = read_imagef(src, sampler, (int2)(xoff.y, yoff.y)).x;
> > +    src_pix.z = read_imagef(src, sampler, (int2)(xoff.z, yoff.z)).x;
> > +    src_pix.w = read_imagef(src, sampler, (int2)(xoff.w, yoff.w)).x;
> >       if (!oobb) {
> >           a = integral_img[(y - p) * width + x - p];
> >           b = integral_img[(y + p) * width + x - p];
> > @@ -93,7 +90,7 @@ kernel void weight_accum(global float *sum, global float *weight,
> >       }
> >
> >       float4 patch_diff = convert_float4(d + a - c - b);
> > -    float4 w = native_exp(-patch_diff / (h * h));
> > +    float4 w = native_exp(-patch_diff * (float4)(255.0f * 255.0f) / (h * h));
> >       float w_sum = w.x + w.y + w.z + w.w;
> >       weight[y * width + x] += w_sum;
> >       sum[y * width + x] += dot(w, convert_float4(src_pix));
> > @@ -109,7 +106,7 @@ kernel void average(__write_only image2d_t dst,
> >       float w = weight[y * dim.x + x];
> >       float s = sum[y * dim.x + x];
> >       float src_pix = read_imagef(src, sampler, (int2)(x, y)).x;
> > -    float r = (s + src_pix * 255) / (1.0f + w) / 255.0f;
> > +    float r = (s + src_pix) / (1.0f + w);
> >       if (x < dim.x && y < dim.y)
> >           write_imagef(dst, (int2)(x, y), (float4)(r, 0.0f, 0.0f, 1.0f));
> >   }
> > diff --git a/libavfilter/vf_nlmeans_opencl.c b/libavfilter/vf_nlmeans_opencl.c
> > index e57b5e0873..0b69f3b6c4 100644
> > --- a/libavfilter/vf_nlmeans_opencl.c
> > +++ b/libavfilter/vf_nlmeans_opencl.c
> > @@ -30,11 +30,9 @@
> >   #include "opencl_source.h"
> >   #include "video.h"
> >
> > -// TODO:
> > -//      the integral image may overflow 32bit, consider using 64bit
> > -
> >   static const enum AVPixelFormat supported_formats[] = {
> >       AV_PIX_FMT_YUV420P,
> > +    AV_PIX_FMT_YUV420P16LE,
> >       AV_PIX_FMT_YUV444P,
>
> Is this the only format it has added support for?  Feels like it could do others which in the MSBs.

I think it will work with any format supported by OpenCL whose
individual planes can be read
by "read_imagef(...).x". I don't know what formats are those. Can you point me?

> >       AV_PIX_FMT_GBRP,
> >   };
> > @@ -59,7 +57,6 @@ typedef struct NLMeansOpenCLContext {
> >       cl_mem                integral_img;
> >       cl_mem                weight;
> >       cl_mem                sum;
> > -    cl_mem                overflow; // overflow in integral image?
> >       double                sigma;
> >       float                 h;
> >       int                   chroma_w;
> > @@ -129,7 +126,7 @@ static int nlmeans_opencl_init(AVFilterContext *avctx, int width, int height)
> >                        "average kernel %d.\n", cle);
> >
> >       ctx->integral_img = clCreateBuffer(ctx->ocf.hwctx->context, 0,
> > -                                       4 * width * height * sizeof(cl_int),
> > +                                       4 * width * height * sizeof(cl_float),
> >                                          NULL, &cle);
> >       CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
> >                        "integral image %d.\n", cle);
> > @@ -144,11 +141,6 @@ static int nlmeans_opencl_init(AVFilterContext *avctx, int width, int height)
> >       CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
> >                        "sum buffer %d.\n", cle);
> >
> > -    ctx->overflow = clCreateBuffer(ctx->ocf.hwctx->context, 0,
> > -                                   sizeof(cl_int), NULL, &cle);
> > -    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
> > -                     "overflow buffer %d.\n", cle);
> > -
> >       ctx->initialised = 1;
> >       return 0;
> >
> > @@ -161,7 +153,6 @@ fail:
> >       CL_RELEASE_MEMORY(ctx->integral_img);
> >       CL_RELEASE_MEMORY(ctx->weight);
> >       CL_RELEASE_MEMORY(ctx->sum);
> > -    CL_RELEASE_MEMORY(ctx->overflow);
> >
> >       CL_RELEASE_QUEUE(ctx->command_queue);
> >       return err;
> > @@ -239,9 +230,8 @@ static int nlmeans_plane(AVFilterContext *avctx, cl_mem dst, cl_mem src,
> >           // vertical pass
> >           // integral(x, y) = sum(integral(x, v)) for v in [0, y]
> >           CL_SET_KERNEL_ARG(ctx->vert_kernel, 0, cl_mem, &ctx->integral_img);
> > -        CL_SET_KERNEL_ARG(ctx->vert_kernel, 1, cl_mem, &ctx->overflow);
> > -        CL_SET_KERNEL_ARG(ctx->vert_kernel, 2, cl_int, &width);
> > -        CL_SET_KERNEL_ARG(ctx->vert_kernel, 3, cl_int, &height);
> > +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 1, cl_int, &width);
> > +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 2, cl_int, &height);
>
> Not affecting this patch, but a side thought: are these arguments actually needed?  They are used as the work sizes, so might be accessible inside the kernel already.
>
> >           cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->vert_kernel,
> >                                        1, NULL, worksize2, NULL, 0, NULL, NULL);
> >           CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue vert_kernel: %d.\n",
> > @@ -293,8 +283,7 @@ static int nlmeans_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
> >       const AVPixFmtDescriptor *desc;
> >       enum AVPixelFormat in_format;
> >       cl_mem src, dst;
> > -    const cl_int zero = 0;
> > -    int w, h, err, cle, overflow, p, patch, research;
> > +    int w, h, err, cle, p, patch, research;
> >
> >       av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",
> >              av_get_pix_fmt_name(input->format),
> > @@ -331,11 +320,6 @@ static int nlmeans_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
> >               goto fail;
> >       }
> >
> > -    cle = clEnqueueWriteBuffer(ctx->command_queue, ctx->overflow, CL_FALSE,
> > -                               0, sizeof(cl_int), &zero, 0, NULL, NULL);
> > -    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to initialize overflow"
> > -                     "detection buffer %d.\n", cle);
> > -
> >       for (p = 0; p < FF_ARRAY_ELEMS(output->data); p++) {
> >           src = (cl_mem) input->data[p];
> >           dst = (cl_mem) output->data[p];
> > @@ -351,17 +335,10 @@ static int nlmeans_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
> >           if (err < 0)
> >               goto fail;
> >       }
> > -    // overflow occurred?
> > -    cle = clEnqueueReadBuffer(ctx->command_queue, ctx->overflow, CL_FALSE,
> > -                              0, sizeof(cl_int), &overflow, 0, NULL, NULL);
> > -    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to read overflow: %d.\n", cle);
> >
> >       cle = clFinish(ctx->command_queue);
> >       CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to finish kernel: %d.\n", cle);
> >
> > -    if (overflow > 0)
> > -        av_log(avctx, AV_LOG_ERROR, "integral image overflow %d\n", overflow);
> > -
> >       av_frame_free(&input);
> >
> >       av_log(ctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n",
> > @@ -390,7 +367,6 @@ static av_cold void nlmeans_opencl_uninit(AVFilterContext *avctx)
> >       CL_RELEASE_MEMORY(ctx->integral_img);
> >       CL_RELEASE_MEMORY(ctx->weight);
> >       CL_RELEASE_MEMORY(ctx->sum);
> > -    CL_RELEASE_MEMORY(ctx->overflow);
> >
> >       CL_RELEASE_QUEUE(ctx->command_queue);
> >
> >
>
> Thanks,
>
> - Mark
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

I'll fix the float = 0(.0) issues.
diff mbox series

Patch

diff --git a/libavfilter/opencl/nlmeans.cl b/libavfilter/opencl/nlmeans.cl
index 72bd681fd6..6d78a41e46 100644
--- a/libavfilter/opencl/nlmeans.cl
+++ b/libavfilter/opencl/nlmeans.cl
@@ -20,7 +20,7 @@  const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
                            CLK_ADDRESS_CLAMP_TO_EDGE   |
                            CLK_FILTER_NEAREST);
 
-kernel void horiz_sum(__global uint4 *integral_img,
+kernel void horiz_sum(__global float4 *integral_img,
                       __read_only image2d_t src,
                       int width,
                       int height,
@@ -31,7 +31,7 @@  kernel void horiz_sum(__global uint4 *integral_img,
     int y = get_global_id(0);
     int work_size = get_global_size(0);
 
-    uint4 sum = (uint4)(0);
+    float4 sum = 0.0;
     float4 s2;
     for (int i = 0; i < width; i++) {
         float s1 = read_imagef(src, sampler, (int2)(i, y)).x;
@@ -39,28 +39,25 @@  kernel void horiz_sum(__global uint4 *integral_img,
         s2.y = read_imagef(src, sampler, (int2)(i + dx.y, y + dy.y)).x;
         s2.z = read_imagef(src, sampler, (int2)(i + dx.z, y + dy.z)).x;
         s2.w = read_imagef(src, sampler, (int2)(i + dx.w, y + dy.w)).x;
-        sum += convert_uint4((s1 - s2) * (s1 - s2) * 255 * 255);
+        sum += (s1 - s2) * (s1 - s2);
         integral_img[y * width + i] = sum;
     }
 }
 
-kernel void vert_sum(__global uint4 *integral_img,
-                     __global int *overflow,
+kernel void vert_sum(__global float4 *integral_img,
                      int width,
                      int height)
 {
     int x = get_global_id(0);
-    uint4 sum = 0;
+    float4 sum = 0;
     for (int i = 0; i < height; i++) {
-        if (any((uint4)UINT_MAX - integral_img[i * width + x] < sum))
-            atomic_inc(overflow);
         integral_img[i * width + x] += sum;
         sum = integral_img[i * width + x];
     }
 }
 
 kernel void weight_accum(global float *sum, global float *weight,
-                         global uint4 *integral_img, __read_only image2d_t src,
+                         global float4 *integral_img, __read_only image2d_t src,
                          int width, int height, int p, float h,
                          int4 dx, int4 dy)
 {
@@ -75,16 +72,16 @@  kernel void weight_accum(global float *sum, global float *weight,
     int y = get_global_id(1);
     int4 xoff = x + dx;
     int4 yoff = y + dy;
-    uint4 a = 0, b = 0, c = 0, d = 0;
-    uint4 src_pix = 0;
+    float4 a = 0, b = 0, c = 0, d = 0;
+    float4 src_pix = 0;
 
     // out-of-bounding-box?
     int oobb = (x - p) < 0 || (y - p) < 0 || (y + p) >= height || (x + p) >= width;
 
-    src_pix.x = (int)(255 * read_imagef(src, sampler, (int2)(xoff.x, yoff.x)).x);
-    src_pix.y = (int)(255 * read_imagef(src, sampler, (int2)(xoff.y, yoff.y)).x);
-    src_pix.z = (int)(255 * read_imagef(src, sampler, (int2)(xoff.z, yoff.z)).x);
-    src_pix.w = (int)(255 * read_imagef(src, sampler, (int2)(xoff.w, yoff.w)).x);
+    src_pix.x = read_imagef(src, sampler, (int2)(xoff.x, yoff.x)).x;
+    src_pix.y = read_imagef(src, sampler, (int2)(xoff.y, yoff.y)).x;
+    src_pix.z = read_imagef(src, sampler, (int2)(xoff.z, yoff.z)).x;
+    src_pix.w = read_imagef(src, sampler, (int2)(xoff.w, yoff.w)).x;
     if (!oobb) {
         a = integral_img[(y - p) * width + x - p];
         b = integral_img[(y + p) * width + x - p];
@@ -93,7 +90,7 @@  kernel void weight_accum(global float *sum, global float *weight,
     }
 
     float4 patch_diff = convert_float4(d + a - c - b);
-    float4 w = native_exp(-patch_diff / (h * h));
+    float4 w = native_exp(-patch_diff * (float4)(255.0f * 255.0f) / (h * h));
     float w_sum = w.x + w.y + w.z + w.w;
     weight[y * width + x] += w_sum;
     sum[y * width + x] += dot(w, convert_float4(src_pix));
@@ -109,7 +106,7 @@  kernel void average(__write_only image2d_t dst,
     float w = weight[y * dim.x + x];
     float s = sum[y * dim.x + x];
     float src_pix = read_imagef(src, sampler, (int2)(x, y)).x;
-    float r = (s + src_pix * 255) / (1.0f + w) / 255.0f;
+    float r = (s + src_pix) / (1.0f + w);
     if (x < dim.x && y < dim.y)
         write_imagef(dst, (int2)(x, y), (float4)(r, 0.0f, 0.0f, 1.0f));
 }
diff --git a/libavfilter/vf_nlmeans_opencl.c b/libavfilter/vf_nlmeans_opencl.c
index e57b5e0873..0b69f3b6c4 100644
--- a/libavfilter/vf_nlmeans_opencl.c
+++ b/libavfilter/vf_nlmeans_opencl.c
@@ -30,11 +30,9 @@ 
 #include "opencl_source.h"
 #include "video.h"
 
-// TODO:
-//      the integral image may overflow 32bit, consider using 64bit
-
 static const enum AVPixelFormat supported_formats[] = {
     AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_YUV420P16LE,
     AV_PIX_FMT_YUV444P,
     AV_PIX_FMT_GBRP,
 };
@@ -59,7 +57,6 @@  typedef struct NLMeansOpenCLContext {
     cl_mem                integral_img;
     cl_mem                weight;
     cl_mem                sum;
-    cl_mem                overflow; // overflow in integral image?
     double                sigma;
     float                 h;
     int                   chroma_w;
@@ -129,7 +126,7 @@  static int nlmeans_opencl_init(AVFilterContext *avctx, int width, int height)
                      "average kernel %d.\n", cle);
 
     ctx->integral_img = clCreateBuffer(ctx->ocf.hwctx->context, 0,
-                                       4 * width * height * sizeof(cl_int),
+                                       4 * width * height * sizeof(cl_float),
                                        NULL, &cle);
     CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
                      "integral image %d.\n", cle);
@@ -144,11 +141,6 @@  static int nlmeans_opencl_init(AVFilterContext *avctx, int width, int height)
     CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
                      "sum buffer %d.\n", cle);
 
-    ctx->overflow = clCreateBuffer(ctx->ocf.hwctx->context, 0,
-                                   sizeof(cl_int), NULL, &cle);
-    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
-                     "overflow buffer %d.\n", cle);
-
     ctx->initialised = 1;
     return 0;
 
@@ -161,7 +153,6 @@  fail:
     CL_RELEASE_MEMORY(ctx->integral_img);
     CL_RELEASE_MEMORY(ctx->weight);
     CL_RELEASE_MEMORY(ctx->sum);
-    CL_RELEASE_MEMORY(ctx->overflow);
 
     CL_RELEASE_QUEUE(ctx->command_queue);
     return err;
@@ -239,9 +230,8 @@  static int nlmeans_plane(AVFilterContext *avctx, cl_mem dst, cl_mem src,
         // vertical pass
         // integral(x, y) = sum(integral(x, v)) for v in [0, y]
         CL_SET_KERNEL_ARG(ctx->vert_kernel, 0, cl_mem, &ctx->integral_img);
-        CL_SET_KERNEL_ARG(ctx->vert_kernel, 1, cl_mem, &ctx->overflow);
-        CL_SET_KERNEL_ARG(ctx->vert_kernel, 2, cl_int, &width);
-        CL_SET_KERNEL_ARG(ctx->vert_kernel, 3, cl_int, &height);
+        CL_SET_KERNEL_ARG(ctx->vert_kernel, 1, cl_int, &width);
+        CL_SET_KERNEL_ARG(ctx->vert_kernel, 2, cl_int, &height);
         cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->vert_kernel,
                                      1, NULL, worksize2, NULL, 0, NULL, NULL);
         CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue vert_kernel: %d.\n",
@@ -293,8 +283,7 @@  static int nlmeans_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
     const AVPixFmtDescriptor *desc;
     enum AVPixelFormat in_format;
     cl_mem src, dst;
-    const cl_int zero = 0;
-    int w, h, err, cle, overflow, p, patch, research;
+    int w, h, err, cle, p, patch, research;
 
     av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",
            av_get_pix_fmt_name(input->format),
@@ -331,11 +320,6 @@  static int nlmeans_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
             goto fail;
     }
 
-    cle = clEnqueueWriteBuffer(ctx->command_queue, ctx->overflow, CL_FALSE,
-                               0, sizeof(cl_int), &zero, 0, NULL, NULL);
-    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to initialize overflow"
-                     "detection buffer %d.\n", cle);
-
     for (p = 0; p < FF_ARRAY_ELEMS(output->data); p++) {
         src = (cl_mem) input->data[p];
         dst = (cl_mem) output->data[p];
@@ -351,17 +335,10 @@  static int nlmeans_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
         if (err < 0)
             goto fail;
     }
-    // overflow occurred?
-    cle = clEnqueueReadBuffer(ctx->command_queue, ctx->overflow, CL_FALSE,
-                              0, sizeof(cl_int), &overflow, 0, NULL, NULL);
-    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to read overflow: %d.\n", cle);
 
     cle = clFinish(ctx->command_queue);
     CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to finish kernel: %d.\n", cle);
 
-    if (overflow > 0)
-        av_log(avctx, AV_LOG_ERROR, "integral image overflow %d\n", overflow);
-
     av_frame_free(&input);
 
     av_log(ctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n",
@@ -390,7 +367,6 @@  static av_cold void nlmeans_opencl_uninit(AVFilterContext *avctx)
     CL_RELEASE_MEMORY(ctx->integral_img);
     CL_RELEASE_MEMORY(ctx->weight);
     CL_RELEASE_MEMORY(ctx->sum);
-    CL_RELEASE_MEMORY(ctx->overflow);
 
     CL_RELEASE_QUEUE(ctx->command_queue);