[FFmpeg-devel,V2,2/2] lavfi/opencl: add nlmeans_opencl filter

Submitted by Ruiling Song on April 12, 2019, 3:09 p.m.

Details

Message ID 20190412150930.8395-2-ruiling.song@intel.com
State New
Headers show

Commit Message

Ruiling Song April 12, 2019, 3:09 p.m.
Signed-off-by: Ruiling Song <ruiling.song@intel.com>
---
 configure                       |   1 +
 doc/filters.texi                |   4 +
 libavfilter/Makefile            |   1 +
 libavfilter/allfilters.c        |   1 +
 libavfilter/opencl/nlmeans.cl   | 115 +++++++++
 libavfilter/opencl_source.h     |   1 +
 libavfilter/vf_nlmeans_opencl.c | 442 ++++++++++++++++++++++++++++++++
 7 files changed, 565 insertions(+)
 create mode 100644 libavfilter/opencl/nlmeans.cl
 create mode 100644 libavfilter/vf_nlmeans_opencl.c

Comments

Mark Thompson April 16, 2019, 9:28 p.m.
On 12/04/2019 16:09, Ruiling Song wrote:
> Signed-off-by: Ruiling Song <ruiling.song@intel.com>

I can't work out where the problem is, but there is something really weirdly nondeterministic going on here.

E.g.

$ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-mbps-4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -frames:v 10 -f framemd5 -
...
0,          0,          0,        1, 12441600, 8b8805818076b23ae6f80ec2b5a349d4
0,          1,          1,        1, 12441600, 7a7fdaa083dc337cfb6af31b643f30a3
0,          2,          2,        1, 12441600, b10ef2a1e5125cc67e262e086f8040b5
0,          3,          3,        1, 12441600, c06b53ad90e0357e537df41b63d5b1dc
0,          4,          4,        1, 12441600, 5aa2da07703859a3dee080847dd17d46
0,          5,          5,        1, 12441600, 733364c6be6af825057e905a6092937d
0,          6,          6,        1, 12441600, 47edae2dec956a582b04babb745d26b0
0,          7,          7,        1, 12441600, 4e45fe8268df4298d06a17ab8e46c3e9
0,          8,          8,        1, 12441600, 960d722a3f8787c9191299a114c04174
0,          9,          9,        1, 12441600, e759c07ee4834a9cf94bfcb4128e7612
$ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-mbps-4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -frames:v 10 -f framemd5 -
0,          0,          0,        1, 12441600, 8b8805818076b23ae6f80ec2b5a349d4
[Parsed_nlmeans_opencl_2 @ 0x5557ae580d00] integral image overflow 2157538
0,          1,          1,        1, 12441600, bce72e10a9f1118940c5a8392ad78ec3
0,          2,          2,        1, 12441600, b10ef2a1e5125cc67e262e086f8040b5
0,          3,          3,        1, 12441600, c06b53ad90e0357e537df41b63d5b1dc
0,          4,          4,        1, 12441600, 5aa2da07703859a3dee080847dd17d46
0,          5,          5,        1, 12441600, 733364c6be6af825057e905a6092937d
0,          6,          6,        1, 12441600, 47edae2dec956a582b04babb745d26b0
0,          7,          7,        1, 12441600, 4e45fe8268df4298d06a17ab8e46c3e9
0,          8,          8,        1, 12441600, 960d722a3f8787c9191299a114c04174
0,          9,          9,        1, 12441600, e759c07ee4834a9cf94bfcb4128e7612
$ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-mbps-4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -frames:v 10 -f framemd5 -
0,          0,          0,        1, 12441600, 8b8805818076b23ae6f80ec2b5a349d4
0,          1,          1,        1, 12441600, 7a7fdaa083dc337cfb6af31b643f30a3
[Parsed_nlmeans_opencl_2 @ 0x557c51fbfe80] integral image overflow 2098545
0,          2,          2,        1, 12441600, 68b390535adc5cfa0f8a7942c42a47ca
0,          3,          3,        1, 12441600, c06b53ad90e0357e537df41b63d5b1dc
0,          4,          4,        1, 12441600, 5aa2da07703859a3dee080847dd17d46
0,          5,          5,        1, 12441600, 733364c6be6af825057e905a6092937d
0,          6,          6,        1, 12441600, 47edae2dec956a582b04babb745d26b0
0,          7,          7,        1, 12441600, 4e45fe8268df4298d06a17ab8e46c3e9
0,          8,          8,        1, 12441600, 960d722a3f8787c9191299a114c04174
0,          9,          9,        1, 12441600, e759c07ee4834a9cf94bfcb4128e7612

Frame 1 gave an overflow on the second run, and gets a different answer, then frame 2 in the same way on the third run?  I can't characterise when this happens, it seems to be pretty random with low probability.

(Input here is a 4K file from <http://jell.yfish.us/>, but I don't think it matters - I saw it with others sometimes as well.)

>  configure                       |   1 +
>  doc/filters.texi                |   4 +
>  libavfilter/Makefile            |   1 +
>  libavfilter/allfilters.c        |   1 +
>  libavfilter/opencl/nlmeans.cl   | 115 +++++++++
>  libavfilter/opencl_source.h     |   1 +
>  libavfilter/vf_nlmeans_opencl.c | 442 ++++++++++++++++++++++++++++++++
>  7 files changed, 565 insertions(+)
>  create mode 100644 libavfilter/opencl/nlmeans.cl
>  create mode 100644 libavfilter/vf_nlmeans_opencl.c

Code all looks fine, as far as I can tell.

Thanks,

- Mark
Ruiling Song April 17, 2019, 2:43 a.m.
> -----Original Message-----

> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of

> Mark Thompson

> Sent: Wednesday, April 17, 2019 5:28 AM

> To: ffmpeg-devel@ffmpeg.org

> Subject: Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add nlmeans_opencl

> filter

> 

> On 12/04/2019 16:09, Ruiling Song wrote:

> > Signed-off-by: Ruiling Song <ruiling.song@intel.com>

> 

> I can't work out where the problem is, but there is something really weirdly

> nondeterministic going on here.

> 

> E.g.

> 

> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-mbps-

> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf

> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -

> frames:v 10 -f framemd5 -

> ...

> 0,          0,          0,        1, 12441600, 8b8805818076b23ae6f80ec2b5a349d4

> 0,          1,          1,        1, 12441600, 7a7fdaa083dc337cfb6af31b643f30a3

> 0,          2,          2,        1, 12441600, b10ef2a1e5125cc67e262e086f8040b5

> 0,          3,          3,        1, 12441600, c06b53ad90e0357e537df41b63d5b1dc

> 0,          4,          4,        1, 12441600, 5aa2da07703859a3dee080847dd17d46

> 0,          5,          5,        1, 12441600, 733364c6be6af825057e905a6092937d

> 0,          6,          6,        1, 12441600, 47edae2dec956a582b04babb745d26b0

> 0,          7,          7,        1, 12441600, 4e45fe8268df4298d06a17ab8e46c3e9

> 0,          8,          8,        1, 12441600, 960d722a3f8787c9191299a114c04174

> 0,          9,          9,        1, 12441600, e759c07ee4834a9cf94bfcb4128e7612

> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-mbps-

> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf

> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -

> frames:v 10 -f framemd5 -

> 0,          0,          0,        1, 12441600, 8b8805818076b23ae6f80ec2b5a349d4

> [Parsed_nlmeans_opencl_2 @ 0x5557ae580d00] integral image overflow

> 2157538

> 0,          1,          1,        1, 12441600, bce72e10a9f1118940c5a8392ad78ec3

> 0,          2,          2,        1, 12441600, b10ef2a1e5125cc67e262e086f8040b5

> 0,          3,          3,        1, 12441600, c06b53ad90e0357e537df41b63d5b1dc

> 0,          4,          4,        1, 12441600, 5aa2da07703859a3dee080847dd17d46

> 0,          5,          5,        1, 12441600, 733364c6be6af825057e905a6092937d

> 0,          6,          6,        1, 12441600, 47edae2dec956a582b04babb745d26b0

> 0,          7,          7,        1, 12441600, 4e45fe8268df4298d06a17ab8e46c3e9

> 0,          8,          8,        1, 12441600, 960d722a3f8787c9191299a114c04174

> 0,          9,          9,        1, 12441600, e759c07ee4834a9cf94bfcb4128e7612

> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-mbps-

> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf

> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -

> frames:v 10 -f framemd5 -

> 0,          0,          0,        1, 12441600, 8b8805818076b23ae6f80ec2b5a349d4

> 0,          1,          1,        1, 12441600, 7a7fdaa083dc337cfb6af31b643f30a3

> [Parsed_nlmeans_opencl_2 @ 0x557c51fbfe80] integral image overflow

> 2098545

> 0,          2,          2,        1, 12441600, 68b390535adc5cfa0f8a7942c42a47ca

> 0,          3,          3,        1, 12441600, c06b53ad90e0357e537df41b63d5b1dc

> 0,          4,          4,        1, 12441600, 5aa2da07703859a3dee080847dd17d46

> 0,          5,          5,        1, 12441600, 733364c6be6af825057e905a6092937d

> 0,          6,          6,        1, 12441600, 47edae2dec956a582b04babb745d26b0

> 0,          7,          7,        1, 12441600, 4e45fe8268df4298d06a17ab8e46c3e9

> 0,          8,          8,        1, 12441600, 960d722a3f8787c9191299a114c04174

> 0,          9,          9,        1, 12441600, e759c07ee4834a9cf94bfcb4128e7612

> 

> Frame 1 gave an overflow on the second run, and gets a different answer, then

> frame 2 in the same way on the third run?  I can't characterise when this

> happens, it seems to be pretty random with low probability.


I tried to reproduce on my SKL and KBL, with Beignet and Neo. And didn't reproduce the issue.
As I am encountering some network issue, I didn't get the video sample you provide (I am using https://4ksamples.com/ses-astra-uhd-test-2-2160p-uhdtv/ ), I can try later to download the same video as you.
May be an OpenCL driver issue? I am not sure yet. So could you provide what hardware and opencl driver version you are using? So I can do some debugging if possible.

> 

> (Input here is a 4K file from <http://jell.yfish.us/>, but I don't think it matters - I

> saw it with others sometimes as well.)

> 

> >  configure                       |   1 +

> >  doc/filters.texi                |   4 +

> >  libavfilter/Makefile            |   1 +

> >  libavfilter/allfilters.c        |   1 +

> >  libavfilter/opencl/nlmeans.cl   | 115 +++++++++

> >  libavfilter/opencl_source.h     |   1 +

> >  libavfilter/vf_nlmeans_opencl.c | 442 ++++++++++++++++++++++++++++++++

> >  7 files changed, 565 insertions(+)

> >  create mode 100644 libavfilter/opencl/nlmeans.cl

> >  create mode 100644 libavfilter/vf_nlmeans_opencl.c

> 

> Code all looks fine, as far as I can tell.

> 

> Thanks,

> 

> - Mark

> _______________________________________________

> ffmpeg-devel mailing list

> ffmpeg-devel@ffmpeg.org

> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

> 

> To unsubscribe, visit link above, or email

> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Mark Thompson April 20, 2019, 3:08 p.m.
On 17/04/2019 03:43, Song, Ruiling wrote:
>> -----Original Message-----
>> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of
>> Mark Thompson
>> Sent: Wednesday, April 17, 2019 5:28 AM
>> To: ffmpeg-devel@ffmpeg.org
>> Subject: Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add nlmeans_opencl
>> filter
>>
>> On 12/04/2019 16:09, Ruiling Song wrote:
>>> Signed-off-by: Ruiling Song <ruiling.song@intel.com>
>>
>> I can't work out where the problem is, but there is something really weirdly
>> nondeterministic going on here.
>>
>> E.g.
>>
>> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-mbps-
>> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf
>> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -
>> frames:v 10 -f framemd5 -
>> ...
>> 0,          0,          0,        1, 12441600, 8b8805818076b23ae6f80ec2b5a349d4
>> 0,          1,          1,        1, 12441600, 7a7fdaa083dc337cfb6af31b643f30a3
>> 0,          2,          2,        1, 12441600, b10ef2a1e5125cc67e262e086f8040b5
>> 0,          3,          3,        1, 12441600, c06b53ad90e0357e537df41b63d5b1dc
>> 0,          4,          4,        1, 12441600, 5aa2da07703859a3dee080847dd17d46
>> 0,          5,          5,        1, 12441600, 733364c6be6af825057e905a6092937d
>> 0,          6,          6,        1, 12441600, 47edae2dec956a582b04babb745d26b0
>> 0,          7,          7,        1, 12441600, 4e45fe8268df4298d06a17ab8e46c3e9
>> 0,          8,          8,        1, 12441600, 960d722a3f8787c9191299a114c04174
>> 0,          9,          9,        1, 12441600, e759c07ee4834a9cf94bfcb4128e7612
>> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-mbps-
>> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf
>> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -
>> frames:v 10 -f framemd5 -
>> 0,          0,          0,        1, 12441600, 8b8805818076b23ae6f80ec2b5a349d4
>> [Parsed_nlmeans_opencl_2 @ 0x5557ae580d00] integral image overflow
>> 2157538
>> 0,          1,          1,        1, 12441600, bce72e10a9f1118940c5a8392ad78ec3
>> 0,          2,          2,        1, 12441600, b10ef2a1e5125cc67e262e086f8040b5
>> 0,          3,          3,        1, 12441600, c06b53ad90e0357e537df41b63d5b1dc
>> 0,          4,          4,        1, 12441600, 5aa2da07703859a3dee080847dd17d46
>> 0,          5,          5,        1, 12441600, 733364c6be6af825057e905a6092937d
>> 0,          6,          6,        1, 12441600, 47edae2dec956a582b04babb745d26b0
>> 0,          7,          7,        1, 12441600, 4e45fe8268df4298d06a17ab8e46c3e9
>> 0,          8,          8,        1, 12441600, 960d722a3f8787c9191299a114c04174
>> 0,          9,          9,        1, 12441600, e759c07ee4834a9cf94bfcb4128e7612
>> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-mbps-
>> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf
>> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -
>> frames:v 10 -f framemd5 -
>> 0,          0,          0,        1, 12441600, 8b8805818076b23ae6f80ec2b5a349d4
>> 0,          1,          1,        1, 12441600, 7a7fdaa083dc337cfb6af31b643f30a3
>> [Parsed_nlmeans_opencl_2 @ 0x557c51fbfe80] integral image overflow
>> 2098545
>> 0,          2,          2,        1, 12441600, 68b390535adc5cfa0f8a7942c42a47ca
>> 0,          3,          3,        1, 12441600, c06b53ad90e0357e537df41b63d5b1dc
>> 0,          4,          4,        1, 12441600, 5aa2da07703859a3dee080847dd17d46
>> 0,          5,          5,        1, 12441600, 733364c6be6af825057e905a6092937d
>> 0,          6,          6,        1, 12441600, 47edae2dec956a582b04babb745d26b0
>> 0,          7,          7,        1, 12441600, 4e45fe8268df4298d06a17ab8e46c3e9
>> 0,          8,          8,        1, 12441600, 960d722a3f8787c9191299a114c04174
>> 0,          9,          9,        1, 12441600, e759c07ee4834a9cf94bfcb4128e7612
>>
>> Frame 1 gave an overflow on the second run, and gets a different answer, then
>> frame 2 in the same way on the third run?  I can't characterise when this
>> happens, it seems to be pretty random with low probability.
> 
> I tried to reproduce on my SKL and KBL, with Beignet and Neo. And didn't reproduce the issue.
> As I am encountering some network issue, I didn't get the video sample you provide (I am using https://4ksamples.com/ses-astra-uhd-test-2-2160p-uhdtv/ ), I can try later to download the same video as you.
> May be an OpenCL driver issue? I am not sure yet. So could you provide what hardware and opencl driver version you are using? So I can do some debugging if possible.

CFL-8700 with git Beignet.

It also sometimes happens with your sample (took >10 tries to get this):

$ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i SES.Astra.UHD.Test.2.2160p.UHDTV.HEVC.x265-LiebeIst.mkv -an -filter_hw_device opencl0 -vf format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -frames:v 10 -f framemd5 -
...
0,          0,          0,        1, 12441600, 3eba6db2c5f693f6b3c8646a950084bc
0,          1,          1,        1, 12441600, b538be935c6bb38dbb6fdfba4ef035d1
0,          2,          2,        1, 12441600, dafec46e81cb9b50609671fd4c9db645
0,          3,          3,        1, 12441600, eaca33534b94031df566489dacacc9e5
0,          4,          4,        1, 12441600, 5e49c45c50b36516ce53c708dd16f512
0,          5,          5,        1, 12441600, 5d1be0800efd126670de20f468ae78b9
0,          6,          6,        1, 12441600, f022199f0519ff884ac2f3d8655e8489
0,          7,          7,        1, 12441600, df9daccf85ef00b99b4c086d890fbddc
0,          8,          8,        1, 12441600, 5a5b16518fce6021569e576505277a27
0,          9,          9,        1, 12441600, 095a68d27d322525e62fb182cb1b9aa1
...
$ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i SES.Astra.UHD.Test.2.2160p.UHDTV.HEVC.x265-LiebeIst.mkv -an -filter_hw_device opencl0 -vf format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -frames:v 10 -f framemd5 -
...
0,          0,          0,        1, 12441600, 3eba6db2c5f693f6b3c8646a950084bc
0,          1,          1,        1, 12441600, b538be935c6bb38dbb6fdfba4ef035d1
0,          2,          2,        1, 12441600, dafec46e81cb9b50609671fd4c9db645
0,          3,          3,        1, 12441600, eaca33534b94031df566489dacacc9e5
0,          4,          4,        1, 12441600, 5e49c45c50b36516ce53c708dd16f512
0,          5,          5,        1, 12441600, 5d1be0800efd126670de20f468ae78b9
0,          6,          6,        1, 12441600, f022199f0519ff884ac2f3d8655e8489
[Parsed_nlmeans_opencl_2 @ 0x565343792d00] integral image overflow 2943427
0,          7,          7,        1, 12441600, bdac59f2b6c73af4ea81e75e6e7cc598
0,          8,          8,        1, 12441600, 5a5b16518fce6021569e576505277a27
0,          9,          9,        1, 12441600, 095a68d27d322525e62fb182cb1b9aa1
...

I'm unable to reproduce on a Mali T760, but the probability seems to be low and that platform is significantly slower / less parallel so it's possible it's just much less likely to happen there.

Thanks,

- Mark
Ruiling Song April 21, 2019, 12:17 p.m.
> -----Original Message-----

> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of

> Mark Thompson

> Sent: Saturday, April 20, 2019 11:08 PM

> To: ffmpeg-devel@ffmpeg.org

> Subject: Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add nlmeans_opencl

> filter

> 

> On 17/04/2019 03:43, Song, Ruiling wrote:

> >> -----Original Message-----

> >> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf

> Of

> >> Mark Thompson

> >> Sent: Wednesday, April 17, 2019 5:28 AM

> >> To: ffmpeg-devel@ffmpeg.org

> >> Subject: Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add

> nlmeans_opencl

> >> filter

> >>

> >> On 12/04/2019 16:09, Ruiling Song wrote:

> >>> Signed-off-by: Ruiling Song <ruiling.song@intel.com>

> >>

> >> I can't work out where the problem is, but there is something really weirdly

> >> nondeterministic going on here.

> >>

> >> E.g.

> >>

> >> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-

> mbps-

> >> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf

> >> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -

> >> frames:v 10 -f framemd5 -

> >> ...

> >> 0,          0,          0,        1, 12441600, 8b8805818076b23ae6f80ec2b5a349d4

> >> 0,          1,          1,        1, 12441600, 7a7fdaa083dc337cfb6af31b643f30a3

> >> 0,          2,          2,        1, 12441600, b10ef2a1e5125cc67e262e086f8040b5

> >> 0,          3,          3,        1, 12441600, c06b53ad90e0357e537df41b63d5b1dc

> >> 0,          4,          4,        1, 12441600, 5aa2da07703859a3dee080847dd17d46

> >> 0,          5,          5,        1, 12441600, 733364c6be6af825057e905a6092937d

> >> 0,          6,          6,        1, 12441600, 47edae2dec956a582b04babb745d26b0

> >> 0,          7,          7,        1, 12441600, 4e45fe8268df4298d06a17ab8e46c3e9

> >> 0,          8,          8,        1, 12441600, 960d722a3f8787c9191299a114c04174

> >> 0,          9,          9,        1, 12441600, e759c07ee4834a9cf94bfcb4128e7612

> >> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-

> mbps-

> >> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf

> >> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -

> >> frames:v 10 -f framemd5 -

> >> 0,          0,          0,        1, 12441600, 8b8805818076b23ae6f80ec2b5a349d4

> >> [Parsed_nlmeans_opencl_2 @ 0x5557ae580d00] integral image overflow

> >> 2157538

> >> 0,          1,          1,        1, 12441600, bce72e10a9f1118940c5a8392ad78ec3

> >> 0,          2,          2,        1, 12441600, b10ef2a1e5125cc67e262e086f8040b5

> >> 0,          3,          3,        1, 12441600, c06b53ad90e0357e537df41b63d5b1dc

> >> 0,          4,          4,        1, 12441600, 5aa2da07703859a3dee080847dd17d46

> >> 0,          5,          5,        1, 12441600, 733364c6be6af825057e905a6092937d

> >> 0,          6,          6,        1, 12441600, 47edae2dec956a582b04babb745d26b0

> >> 0,          7,          7,        1, 12441600, 4e45fe8268df4298d06a17ab8e46c3e9

> >> 0,          8,          8,        1, 12441600, 960d722a3f8787c9191299a114c04174

> >> 0,          9,          9,        1, 12441600, e759c07ee4834a9cf94bfcb4128e7612

> >> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-

> mbps-

> >> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf

> >> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -

> >> frames:v 10 -f framemd5 -

> >> 0,          0,          0,        1, 12441600, 8b8805818076b23ae6f80ec2b5a349d4

> >> 0,          1,          1,        1, 12441600, 7a7fdaa083dc337cfb6af31b643f30a3

> >> [Parsed_nlmeans_opencl_2 @ 0x557c51fbfe80] integral image overflow

> >> 2098545

> >> 0,          2,          2,        1, 12441600, 68b390535adc5cfa0f8a7942c42a47ca

> >> 0,          3,          3,        1, 12441600, c06b53ad90e0357e537df41b63d5b1dc

> >> 0,          4,          4,        1, 12441600, 5aa2da07703859a3dee080847dd17d46

> >> 0,          5,          5,        1, 12441600, 733364c6be6af825057e905a6092937d

> >> 0,          6,          6,        1, 12441600, 47edae2dec956a582b04babb745d26b0

> >> 0,          7,          7,        1, 12441600, 4e45fe8268df4298d06a17ab8e46c3e9

> >> 0,          8,          8,        1, 12441600, 960d722a3f8787c9191299a114c04174

> >> 0,          9,          9,        1, 12441600, e759c07ee4834a9cf94bfcb4128e7612

> >>

> >> Frame 1 gave an overflow on the second run, and gets a different answer,

> then

> >> frame 2 in the same way on the third run?  I can't characterise when this

> >> happens, it seems to be pretty random with low probability.

> >

> > I tried to reproduce on my SKL and KBL, with Beignet and Neo. And didn't

> reproduce the issue.

> > As I am encountering some network issue, I didn't get the video sample you

> provide (I am using https://4ksamples.com/ses-astra-uhd-test-2-2160p-uhdtv/ ),

> I can try later to download the same video as you.

> > May be an OpenCL driver issue? I am not sure yet. So could you provide what

> hardware and opencl driver version you are using? So I can do some debugging if

> possible.

> 

> CFL-8700 with git Beignet.

First I want to say that Beignet never declare official support of CFL, which means that CFL was not fully tested.
I guess your problem is specific to CFL, may be specific to Beignet + CFL, maybe not.
I highly recommend you to try NEO(https://github.com/intel/compute-runtime ) which officially support CFL.
If you cannot reproduce with NEO, then it would be obvious this is a bug of Beignet on CFL.
I also try jellyfish sample on KBL and SKL, both Beignet and NEO, still not reproduce the issue.
The Beignet was not developed or tested anymore. What's more the CFL support of Beignet was not tested extensively.
I will try to find one CFL machine to have a test.
> 

> It also sometimes happens with your sample (took >10 tries to get this):

> 

> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i

> SES.Astra.UHD.Test.2.2160p.UHDTV.HEVC.x265-LiebeIst.mkv -an -

> filter_hw_device opencl0 -vf

> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -

> frames:v 10 -f framemd5 -

> ...

> 0,          0,          0,        1, 12441600, 3eba6db2c5f693f6b3c8646a950084bc

> 0,          1,          1,        1, 12441600, b538be935c6bb38dbb6fdfba4ef035d1

> 0,          2,          2,        1, 12441600, dafec46e81cb9b50609671fd4c9db645

> 0,          3,          3,        1, 12441600, eaca33534b94031df566489dacacc9e5

> 0,          4,          4,        1, 12441600, 5e49c45c50b36516ce53c708dd16f512

> 0,          5,          5,        1, 12441600, 5d1be0800efd126670de20f468ae78b9

> 0,          6,          6,        1, 12441600, f022199f0519ff884ac2f3d8655e8489

> 0,          7,          7,        1, 12441600, df9daccf85ef00b99b4c086d890fbddc

> 0,          8,          8,        1, 12441600, 5a5b16518fce6021569e576505277a27

> 0,          9,          9,        1, 12441600, 095a68d27d322525e62fb182cb1b9aa1

> ...

> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i

> SES.Astra.UHD.Test.2.2160p.UHDTV.HEVC.x265-LiebeIst.mkv -an -

> filter_hw_device opencl0 -vf

> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -

> frames:v 10 -f framemd5 -

> ...

> 0,          0,          0,        1, 12441600, 3eba6db2c5f693f6b3c8646a950084bc

> 0,          1,          1,        1, 12441600, b538be935c6bb38dbb6fdfba4ef035d1

> 0,          2,          2,        1, 12441600, dafec46e81cb9b50609671fd4c9db645

> 0,          3,          3,        1, 12441600, eaca33534b94031df566489dacacc9e5

> 0,          4,          4,        1, 12441600, 5e49c45c50b36516ce53c708dd16f512

> 0,          5,          5,        1, 12441600, 5d1be0800efd126670de20f468ae78b9

> 0,          6,          6,        1, 12441600, f022199f0519ff884ac2f3d8655e8489

> [Parsed_nlmeans_opencl_2 @ 0x565343792d00] integral image overflow

> 2943427

> 0,          7,          7,        1, 12441600, bdac59f2b6c73af4ea81e75e6e7cc598

> 0,          8,          8,        1, 12441600, 5a5b16518fce6021569e576505277a27

> 0,          9,          9,        1, 12441600, 095a68d27d322525e62fb182cb1b9aa1

> ...

> 

> I'm unable to reproduce on a Mali T760, but the probability seems to be low and

> that platform is significantly slower / less parallel so it's possible it's just much

> less likely to happen there.

You can try with "nlmeans_opencl=r=5" to do a faster test.

> 

> Thanks,

> 

> - Mark

> _______________________________________________

> ffmpeg-devel mailing list

> ffmpeg-devel@ffmpeg.org

> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

> 

> To unsubscribe, visit link above, or email

> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Ruiling Song April 23, 2019, 8:51 a.m.
> -----Original Message-----

> From: Song, Ruiling

> Sent: Sunday, April 21, 2019 8:18 PM

> To: FFmpeg development discussions and patches <ffmpeg-

> devel@ffmpeg.org>

> Subject: RE: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add

> nlmeans_opencl filter

> 

> 

> 

> > -----Original Message-----

> > From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On

> Behalf Of

> > Mark Thompson

> > Sent: Saturday, April 20, 2019 11:08 PM

> > To: ffmpeg-devel@ffmpeg.org

> > Subject: Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add

> nlmeans_opencl

> > filter

> >

> > On 17/04/2019 03:43, Song, Ruiling wrote:

> > >> -----Original Message-----

> > >> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On

> Behalf

> > Of

> > >> Mark Thompson

> > >> Sent: Wednesday, April 17, 2019 5:28 AM

> > >> To: ffmpeg-devel@ffmpeg.org

> > >> Subject: Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add

> > nlmeans_opencl

> > >> filter

> > >>

> > >> On 12/04/2019 16:09, Ruiling Song wrote:

> > >>> Signed-off-by: Ruiling Song <ruiling.song@intel.com>

> > >>

> > >> I can't work out where the problem is, but there is something really

> weirdly

> > >> nondeterministic going on here.

> > >>

> > >> E.g.

> > >>

> > >> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-

> > mbps-

> > >> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf

> > >>

> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -

> > >> frames:v 10 -f framemd5 -

> > >> ...

> > >> 0,          0,          0,        1, 12441600, 8b8805818076b23ae6f80ec2b5a349d4

> > >> 0,          1,          1,        1, 12441600, 7a7fdaa083dc337cfb6af31b643f30a3

> > >> 0,          2,          2,        1, 12441600, b10ef2a1e5125cc67e262e086f8040b5

> > >> 0,          3,          3,        1, 12441600, c06b53ad90e0357e537df41b63d5b1dc

> > >> 0,          4,          4,        1, 12441600, 5aa2da07703859a3dee080847dd17d46

> > >> 0,          5,          5,        1, 12441600, 733364c6be6af825057e905a6092937d

> > >> 0,          6,          6,        1, 12441600, 47edae2dec956a582b04babb745d26b0

> > >> 0,          7,          7,        1, 12441600, 4e45fe8268df4298d06a17ab8e46c3e9

> > >> 0,          8,          8,        1, 12441600, 960d722a3f8787c9191299a114c04174

> > >> 0,          9,          9,        1, 12441600, e759c07ee4834a9cf94bfcb4128e7612

> > >> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-

> > mbps-

> > >> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf

> > >>

> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -

> > >> frames:v 10 -f framemd5 -

> > >> 0,          0,          0,        1, 12441600, 8b8805818076b23ae6f80ec2b5a349d4

> > >> [Parsed_nlmeans_opencl_2 @ 0x5557ae580d00] integral image

> overflow

> > >> 2157538

> > >> 0,          1,          1,        1, 12441600, bce72e10a9f1118940c5a8392ad78ec3

> > >> 0,          2,          2,        1, 12441600, b10ef2a1e5125cc67e262e086f8040b5

> > >> 0,          3,          3,        1, 12441600, c06b53ad90e0357e537df41b63d5b1dc

> > >> 0,          4,          4,        1, 12441600, 5aa2da07703859a3dee080847dd17d46

> > >> 0,          5,          5,        1, 12441600, 733364c6be6af825057e905a6092937d

> > >> 0,          6,          6,        1, 12441600, 47edae2dec956a582b04babb745d26b0

> > >> 0,          7,          7,        1, 12441600, 4e45fe8268df4298d06a17ab8e46c3e9

> > >> 0,          8,          8,        1, 12441600, 960d722a3f8787c9191299a114c04174

> > >> 0,          9,          9,        1, 12441600, e759c07ee4834a9cf94bfcb4128e7612

> > >> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-120-

> > mbps-

> > >> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf

> > >>

> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -

> > >> frames:v 10 -f framemd5 -

> > >> 0,          0,          0,        1, 12441600, 8b8805818076b23ae6f80ec2b5a349d4

> > >> 0,          1,          1,        1, 12441600, 7a7fdaa083dc337cfb6af31b643f30a3

> > >> [Parsed_nlmeans_opencl_2 @ 0x557c51fbfe80] integral image overflow

> > >> 2098545

> > >> 0,          2,          2,        1, 12441600, 68b390535adc5cfa0f8a7942c42a47ca

> > >> 0,          3,          3,        1, 12441600, c06b53ad90e0357e537df41b63d5b1dc

> > >> 0,          4,          4,        1, 12441600, 5aa2da07703859a3dee080847dd17d46

> > >> 0,          5,          5,        1, 12441600, 733364c6be6af825057e905a6092937d

> > >> 0,          6,          6,        1, 12441600, 47edae2dec956a582b04babb745d26b0

> > >> 0,          7,          7,        1, 12441600, 4e45fe8268df4298d06a17ab8e46c3e9

> > >> 0,          8,          8,        1, 12441600, 960d722a3f8787c9191299a114c04174

> > >> 0,          9,          9,        1, 12441600, e759c07ee4834a9cf94bfcb4128e7612

> > >>

> > >> Frame 1 gave an overflow on the second run, and gets a different

> answer,

> > then

> > >> frame 2 in the same way on the third run?  I can't characterise when this

> > >> happens, it seems to be pretty random with low probability.

> > >

> > > I tried to reproduce on my SKL and KBL, with Beignet and Neo. And didn't

> > reproduce the issue.

> > > As I am encountering some network issue, I didn't get the video sample

> you

> > provide (I am using https://4ksamples.com/ses-astra-uhd-test-2-2160p-

> uhdtv/ ),

> > I can try later to download the same video as you.

> > > May be an OpenCL driver issue? I am not sure yet. So could you provide

> what

> > hardware and opencl driver version you are using? So I can do some

> debugging if

> > possible.

> >

> > CFL-8700 with git Beignet.

> First I want to say that Beignet never declare official support of CFL, which

> means that CFL was not fully tested.

> I guess your problem is specific to CFL, may be specific to Beignet + CFL,

> maybe not.

> I highly recommend you to try NEO(https://github.com/intel/compute-

> runtime ) which officially support CFL.

> If you cannot reproduce with NEO, then it would be obvious this is a bug of

> Beignet on CFL.

> I also try jellyfish sample on KBL and SKL, both Beignet and NEO, still not

> reproduce the issue.

> The Beignet was not developed or tested anymore. What's more the CFL

> support of Beignet was not tested extensively.

> I will try to find one CFL machine to have a test.

I think you are running  against Beignet “Allow creating out-of-order queues with clCreateCommandQueue”, right?
I got one CFL i5-7600k, and only make small modification to CMake file to use llvm-4.0.
And use exact command and the jellyfish video clip. Still could not reproduce.
So could you have a test against intel-compute-runtime when you have time?
Or do you have any local changes against Beignet or related software?
Which Linux kernel, libdrm, llvm version are you using?

I am guessing may be the event or asynchronous not correctly handled in Beignet,
Could you make some local modification and test on your machine?
Make the clEnqueueWriteBuffer(ctx->command_queue, ctx->overflow, CL_TRUE,...);
Previously I use CL_FALSE. Hope synchronous write could help on this.

Thanks!
Ruiling
> >

> > It also sometimes happens with your sample (took >10 tries to get this):

> >

> > $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i

> > SES.Astra.UHD.Test.2.2160p.UHDTV.HEVC.x265-LiebeIst.mkv -an -

> > filter_hw_device opencl0 -vf

> >

> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -

> > frames:v 10 -f framemd5 -

> > ...

> > 0,          0,          0,        1, 12441600, 3eba6db2c5f693f6b3c8646a950084bc

> > 0,          1,          1,        1, 12441600, b538be935c6bb38dbb6fdfba4ef035d1

> > 0,          2,          2,        1, 12441600, dafec46e81cb9b50609671fd4c9db645

> > 0,          3,          3,        1, 12441600, eaca33534b94031df566489dacacc9e5

> > 0,          4,          4,        1, 12441600, 5e49c45c50b36516ce53c708dd16f512

> > 0,          5,          5,        1, 12441600, 5d1be0800efd126670de20f468ae78b9

> > 0,          6,          6,        1, 12441600, f022199f0519ff884ac2f3d8655e8489

> > 0,          7,          7,        1, 12441600, df9daccf85ef00b99b4c086d890fbddc

> > 0,          8,          8,        1, 12441600, 5a5b16518fce6021569e576505277a27

> > 0,          9,          9,        1, 12441600, 095a68d27d322525e62fb182cb1b9aa1

> > ...

> > $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i

> > SES.Astra.UHD.Test.2.2160p.UHDTV.HEVC.x265-LiebeIst.mkv -an -

> > filter_hw_device opencl0 -vf

> >

> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -

> > frames:v 10 -f framemd5 -

> > ...

> > 0,          0,          0,        1, 12441600, 3eba6db2c5f693f6b3c8646a950084bc

> > 0,          1,          1,        1, 12441600, b538be935c6bb38dbb6fdfba4ef035d1

> > 0,          2,          2,        1, 12441600, dafec46e81cb9b50609671fd4c9db645

> > 0,          3,          3,        1, 12441600, eaca33534b94031df566489dacacc9e5

> > 0,          4,          4,        1, 12441600, 5e49c45c50b36516ce53c708dd16f512

> > 0,          5,          5,        1, 12441600, 5d1be0800efd126670de20f468ae78b9

> > 0,          6,          6,        1, 12441600, f022199f0519ff884ac2f3d8655e8489

> > [Parsed_nlmeans_opencl_2 @ 0x565343792d00] integral image overflow

> > 2943427

> > 0,          7,          7,        1, 12441600, bdac59f2b6c73af4ea81e75e6e7cc598

> > 0,          8,          8,        1, 12441600, 5a5b16518fce6021569e576505277a27

> > 0,          9,          9,        1, 12441600, 095a68d27d322525e62fb182cb1b9aa1

> > ...

> >

> > I'm unable to reproduce on a Mali T760, but the probability seems to be low

> and

> > that platform is significantly slower / less parallel so it's possible it's just

> much

> > less likely to happen there.

> You can try with "nlmeans_opencl=r=5" to do a faster test.

> 

> >

> > Thanks,

> >

> > - Mark

> > _______________________________________________

> > ffmpeg-devel mailing list

> > ffmpeg-devel@ffmpeg.org

> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

> >

> > To unsubscribe, visit link above, or email

> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Ruiling Song April 29, 2019, 2:06 a.m.
> -----Original Message-----

> From: Song, Ruiling

> Sent: Tuesday, April 23, 2019 4:52 PM

> To: 'FFmpeg development discussions and patches' <ffmpeg-

> devel@ffmpeg.org>

> Subject: RE: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add

> nlmeans_opencl filter

> 

> 

> 

> > -----Original Message-----

> > From: Song, Ruiling

> > Sent: Sunday, April 21, 2019 8:18 PM

> > To: FFmpeg development discussions and patches <ffmpeg-

> > devel@ffmpeg.org>

> > Subject: RE: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add

> > nlmeans_opencl filter

> >

> >

> >

> > > -----Original Message-----

> > > From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On

> > Behalf Of

> > > Mark Thompson

> > > Sent: Saturday, April 20, 2019 11:08 PM

> > > To: ffmpeg-devel@ffmpeg.org

> > > Subject: Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add

> > nlmeans_opencl

> > > filter

> > >

> > > On 17/04/2019 03:43, Song, Ruiling wrote:

> > > >> -----Original Message-----

> > > >> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On

> > Behalf

> > > Of

> > > >> Mark Thompson

> > > >> Sent: Wednesday, April 17, 2019 5:28 AM

> > > >> To: ffmpeg-devel@ffmpeg.org

> > > >> Subject: Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add

> > > nlmeans_opencl

> > > >> filter

> > > >>

> > > >> On 12/04/2019 16:09, Ruiling Song wrote:

> > > >>> Signed-off-by: Ruiling Song <ruiling.song@intel.com>

> > > >>

> > > >> I can't work out where the problem is, but there is something really

> > weirdly

> > > >> nondeterministic going on here.

> > > >>

> > > >> E.g.

> > > >>

> > > >> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-

> 120-

> > > mbps-

> > > >> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf

> > > >>

> >

> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -

> > > >> frames:v 10 -f framemd5 -

> > > >> ...

> > > >> 0,          0,          0,        1, 12441600, 8b8805818076b23ae6f80ec2b5a349d4

> > > >> 0,          1,          1,        1, 12441600, 7a7fdaa083dc337cfb6af31b643f30a3

> > > >> 0,          2,          2,        1, 12441600, b10ef2a1e5125cc67e262e086f8040b5

> > > >> 0,          3,          3,        1, 12441600, c06b53ad90e0357e537df41b63d5b1dc

> > > >> 0,          4,          4,        1, 12441600, 5aa2da07703859a3dee080847dd17d46

> > > >> 0,          5,          5,        1, 12441600, 733364c6be6af825057e905a6092937d

> > > >> 0,          6,          6,        1, 12441600, 47edae2dec956a582b04babb745d26b0

> > > >> 0,          7,          7,        1, 12441600, 4e45fe8268df4298d06a17ab8e46c3e9

> > > >> 0,          8,          8,        1, 12441600, 960d722a3f8787c9191299a114c04174

> > > >> 0,          9,          9,        1, 12441600, e759c07ee4834a9cf94bfcb4128e7612

> > > >> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-

> 120-

> > > mbps-

> > > >> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf

> > > >>

> >

> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -

> > > >> frames:v 10 -f framemd5 -

> > > >> 0,          0,          0,        1, 12441600, 8b8805818076b23ae6f80ec2b5a349d4

> > > >> [Parsed_nlmeans_opencl_2 @ 0x5557ae580d00] integral image

> > overflow

> > > >> 2157538

> > > >> 0,          1,          1,        1, 12441600, bce72e10a9f1118940c5a8392ad78ec3

> > > >> 0,          2,          2,        1, 12441600, b10ef2a1e5125cc67e262e086f8040b5

> > > >> 0,          3,          3,        1, 12441600, c06b53ad90e0357e537df41b63d5b1dc

> > > >> 0,          4,          4,        1, 12441600, 5aa2da07703859a3dee080847dd17d46

> > > >> 0,          5,          5,        1, 12441600, 733364c6be6af825057e905a6092937d

> > > >> 0,          6,          6,        1, 12441600, 47edae2dec956a582b04babb745d26b0

> > > >> 0,          7,          7,        1, 12441600, 4e45fe8268df4298d06a17ab8e46c3e9

> > > >> 0,          8,          8,        1, 12441600, 960d722a3f8787c9191299a114c04174

> > > >> 0,          9,          9,        1, 12441600, e759c07ee4834a9cf94bfcb4128e7612

> > > >> $ ./ffmpeg_g -y -init_hw_device opencl:0.0 -i ~/video/test/jellyfish-

> 120-

> > > mbps-

> > > >> 4k-uhd-hevc-10bit.mkv -an -filter_hw_device opencl0 -vf

> > > >>

> >

> format=yuv420p,hwupload,nlmeans_opencl,hwdownload,format=yuv420p -

> > > >> frames:v 10 -f framemd5 -

> > > >> 0,          0,          0,        1, 12441600, 8b8805818076b23ae6f80ec2b5a349d4

> > > >> 0,          1,          1,        1, 12441600, 7a7fdaa083dc337cfb6af31b643f30a3

> > > >> [Parsed_nlmeans_opencl_2 @ 0x557c51fbfe80] integral image

> overflow

> > > >> 2098545

> > > >> 0,          2,          2,        1, 12441600, 68b390535adc5cfa0f8a7942c42a47ca

> > > >> 0,          3,          3,        1, 12441600, c06b53ad90e0357e537df41b63d5b1dc

> > > >> 0,          4,          4,        1, 12441600, 5aa2da07703859a3dee080847dd17d46

> > > >> 0,          5,          5,        1, 12441600, 733364c6be6af825057e905a6092937d

> > > >> 0,          6,          6,        1, 12441600, 47edae2dec956a582b04babb745d26b0

> > > >> 0,          7,          7,        1, 12441600, 4e45fe8268df4298d06a17ab8e46c3e9

> > > >> 0,          8,          8,        1, 12441600, 960d722a3f8787c9191299a114c04174

> > > >> 0,          9,          9,        1, 12441600, e759c07ee4834a9cf94bfcb4128e7612

> > > >>

> > > >> Frame 1 gave an overflow on the second run, and gets a different

> > answer,

> > > then

> > > >> frame 2 in the same way on the third run?  I can't characterise when

> this

> > > >> happens, it seems to be pretty random with low probability.

> > > >

> > > > I tried to reproduce on my SKL and KBL, with Beignet and Neo. And

> didn't

> > > reproduce the issue.

> > > > As I am encountering some network issue, I didn't get the video sample

> > you

> > > provide (I am using https://4ksamples.com/ses-astra-uhd-test-2-2160p-

> > uhdtv/ ),

> > > I can try later to download the same video as you.

> > > > May be an OpenCL driver issue? I am not sure yet. So could you provide

> > what

> > > hardware and opencl driver version you are using? So I can do some

> > debugging if

> > > possible.

> > >

> > > CFL-8700 with git Beignet.

> > First I want to say that Beignet never declare official support of CFL, which

> > means that CFL was not fully tested.

> > I guess your problem is specific to CFL, may be specific to Beignet + CFL,

> > maybe not.

> > I highly recommend you to try NEO(https://github.com/intel/compute-

> > runtime ) which officially support CFL.

> > If you cannot reproduce with NEO, then it would be obvious this is a bug of

> > Beignet on CFL.

> > I also try jellyfish sample on KBL and SKL, both Beignet and NEO, still not

> > reproduce the issue.

> > The Beignet was not developed or tested anymore. What's more the CFL

> > support of Beignet was not tested extensively.

> > I will try to find one CFL machine to have a test.

> I think you are running  against Beignet “Allow creating out-of-order queues

> with clCreateCommandQueue”, right?

> I got one CFL i5-7600k, and only make small modification to CMake file to use

> llvm-4.0.

> And use exact command and the jellyfish video clip. Still could not reproduce.

> So could you have a test against intel-compute-runtime when you have time?

> Or do you have any local changes against Beignet or related software?

> Which Linux kernel, libdrm, llvm version are you using?

> 

> I am guessing may be the event or asynchronous not correctly handled in

> Beignet,

> Could you make some local modification and test on your machine?

> Make the clEnqueueWriteBuffer(ctx->command_queue, ctx->overflow,

> CL_TRUE,...);

> Previously I use CL_FALSE. Hope synchronous write could help on this.


In order to verify the patch, I also have more testing on the CPU OpenCL driver from Intel.
I make it run 100 times, and still not see any reported overflow. So I think we can say the filter is in good quality to be merged. Any different idea?

Thanks!
Ruiling
Ruiling Song May 5, 2019, 6:27 a.m.
Will apply.

> _______________________________________________

> ffmpeg-devel mailing list

> ffmpeg-devel@ffmpeg.org

> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

> 

> To unsubscribe, visit link above, or email

> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Mark Thompson May 6, 2019, 2:19 p.m.
On 29/04/2019 03:06, Song, Ruiling wrote:> 
> In order to verify the patch, I also have more testing on the CPU OpenCL driver from Intel.
> I make it run 100 times, and still not see any reported overflow. So I think we can say the filter is in good quality to be merged. Any different idea?

I've tried a lot more times on some additional platforms (Skylake-GT3, Mali-G52) and I can't reproduce it on anything else.  So, I think I agree that it must be a driver issue and shouldn't block anything.


On 12/04/2019 16:09, Ruiling Song wrote:
> Signed-off-by: Ruiling Song <ruiling.song@intel.com>
> ---
>  configure                       |   1 +
>  doc/filters.texi                |   4 +
>  libavfilter/Makefile            |   1 +
>  libavfilter/allfilters.c        |   1 +
>  libavfilter/opencl/nlmeans.cl   | 115 +++++++++
>  libavfilter/opencl_source.h     |   1 +
>  libavfilter/vf_nlmeans_opencl.c | 442 ++++++++++++++++++++++++++++++++
>  7 files changed, 565 insertions(+)
>  create mode 100644 libavfilter/opencl/nlmeans.cl
>  create mode 100644 libavfilter/vf_nlmeans_opencl.c
> 
> ...
> +
> +static int nlmeans_plane(AVFilterContext *avctx, cl_mem dst, cl_mem src,
> +                         cl_int width, cl_int height, cl_int p, cl_int r)
> +{
> +    NLMeansOpenCLContext *ctx = avctx->priv;
> +    const float zero = 0.0f;
> +    const size_t worksize1[] = {height};
> +    const size_t worksize2[] = {width};
> +    const size_t worksize3[2] = {width, height};
> +    int dx, dy, err = 0, weight_buf_size;
> +    cl_int cle;
> +    int nb_pixel, *tmp, idx = 0;
> +    cl_int *dxdy;
> +
> +    weight_buf_size = width * height * sizeof(float);
> +    cle = clEnqueueFillBuffer(ctx->command_queue, ctx->weight,
> +                              &zero, sizeof(float), 0, weight_buf_size,
> +                              0, NULL, NULL);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to fill weight buffer: %d.\n",
> +                     cle);
> +    cle = clEnqueueFillBuffer(ctx->command_queue, ctx->sum,
> +                              &zero, sizeof(float), 0, weight_buf_size,
> +                              0, NULL, NULL);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to fill sum buffer: %d.\n",
> +                     cle);
> +
> +    nb_pixel = (2 * r + 1) * (2 * r + 1) - 1;
> +    dxdy = av_malloc(nb_pixel * 2 * sizeof(cl_int));
> +    tmp = av_malloc(nb_pixel * 2 * sizeof(int));
> +
> +    if (!dxdy || !tmp)
> +        goto fail;
> +
> +    for (dx = -r; dx <= r; dx++) {
> +        for (dy = -r; dy <= r; dy++) {
> +            if (dx || dy) {
> +                tmp[idx++] = dx;
> +                tmp[idx++] = dy;
> +            }
> +        }
> +    }
> +    // repack dx/dy seperately, as we want to do four pairs of dx/dy in a batch
> +    for (int i = 0; i < nb_pixel / 4; i++) {
> +        dxdy[i * 8] = tmp[i * 8];         // dx0
> +        dxdy[i * 8 + 1] = tmp[i * 8 + 2]; // dx1
> +        dxdy[i * 8 + 2] = tmp[i * 8 + 4]; // dx2
> +        dxdy[i * 8 + 3] = tmp[i * 8 + 6]; // dx3
> +        dxdy[i * 8 + 4] = tmp[i * 8 + 1]; // dy0
> +        dxdy[i * 8 + 5] = tmp[i * 8 + 3]; // dy1
> +        dxdy[i * 8 + 6] = tmp[i * 8 + 5]; // dy2
> +        dxdy[i * 8 + 7] = tmp[i * 8 + 7]; // dy3
> +    }
> +    av_freep(&tmp);
> +
> +    for (int i = 0; i < nb_pixel / 4; i++) {
> +        int *dx_cur = dxdy + 8 * i;
> +        int *dy_cur = dxdy + 8 * i + 4;

cl_int.

> +
> +        // horizontal pass
> +        // integral(x,y) = sum([u(v,y) - u(v+dx,y+dy)]^2) for v in [0, x]
> +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 0, cl_mem, &ctx->integral_img);
> +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 1, cl_mem, &src);
> +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 2, cl_int, &width);
> +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 3, cl_int, &height);
> +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 4, cl_int4, dx_cur);
> +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 5, cl_int4, dy_cur);
> +        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->horiz_kernel, 1,
> +                               NULL, worksize1, NULL, 0, NULL, NULL);
> +        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue horiz_kernel: %d.\n",
> +                         cle);
> +        // vertical pass
> +        // integral(x, y) = sum(integral(x, v)) for v in [0, y]
> +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 0, cl_mem, &ctx->integral_img);
> +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 1, cl_mem, &ctx->overflow);
> +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 2, cl_int, &width);
> +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 3, cl_int, &height);
> +        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->vert_kernel,
> +                                     1, NULL, worksize2, NULL, 0, NULL, NULL);
> +        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue vert_kernel: %d.\n",
> +                         cle);
> +
> +        // accumlate weights
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 0, cl_mem, &ctx->sum);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 1, cl_mem, &ctx->weight);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 2, cl_mem, &ctx->integral_img);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 3, cl_mem, &src);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 4, cl_int, &width);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 5, cl_int, &height);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 6, cl_int, &p);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 7, cl_float, &ctx->h);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 8, cl_int4, dx_cur);
> +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 9, cl_int4, dy_cur);
> +        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->accum_kernel,
> +                                     2, NULL, worksize3, NULL, 0, NULL, NULL);
> +        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue kernel: %d.\n", cle);
> +    }
> +    av_freep(&dxdy);
> +
> +    // average
> +    CL_SET_KERNEL_ARG(ctx->average_kernel, 0, cl_mem, &dst);
> +    CL_SET_KERNEL_ARG(ctx->average_kernel, 1, cl_mem, &src);
> +    CL_SET_KERNEL_ARG(ctx->average_kernel, 2, cl_mem, &ctx->sum);
> +    CL_SET_KERNEL_ARG(ctx->average_kernel, 3, cl_mem, &ctx->weight);
> +    cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->average_kernel, 2,
> +                                 NULL, worksize3, NULL, 0, NULL, NULL);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue average kernel: %d.\n",
> +                     cle);
> +    cle = clFlush(ctx->command_queue);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to flush command queue: %d.\n", cle);
> +fail:
> +    if (tmp)
> +      av_freep(&tmp);
> +    if (dxdy)
> +      av_freep(&dxdy);

Funny indent.

> +    return err;
> +}
> +
> +static int nlmeans_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
> +{
> +    AVFilterContext    *avctx = inlink->dst;
> +    AVFilterLink     *outlink = avctx->outputs[0];
> +    NLMeansOpenCLContext *ctx = avctx->priv;
> +    AVFrame *output = NULL;
> +    AVHWFramesContext *input_frames_ctx;
> +    const AVPixFmtDescriptor *desc;
> +    enum AVPixelFormat in_format;
> +    cl_mem src, dst;
> +    const cl_int zero = 0;
> +    int w, h, err, cle, overflow, p, patch, research;
> +
> +    av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",
> +           av_get_pix_fmt_name(input->format),
> +           input->width, input->height, input->pts);
> +
> +    if (!input->hw_frames_ctx)
> +        return AVERROR(EINVAL);
> +    input_frames_ctx = (AVHWFramesContext*)input->hw_frames_ctx->data;
> +    in_format = input_frames_ctx->sw_format;
> +
> +    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
> +    if (!output) {
> +        err = AVERROR(ENOMEM);
> +        goto fail;
> +    }
> +
> +    err = av_frame_copy_props(output, input);
> +    if (err < 0)
> +        goto fail;
> +
> +    if (!ctx->initialised) {
> +        desc = av_pix_fmt_desc_get(in_format);
> +        if (!is_format_supported(in_format)) {
> +            err = AVERROR(EINVAL);
> +            av_log(avctx, AV_LOG_ERROR, "input format %s not supported\n",
> +                   av_get_pix_fmt_name(in_format));
> +            goto fail;
> +        }
> +        ctx->chroma_w = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);
> +        ctx->chroma_h = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
> +
> +        err = nlmeans_opencl_init(avctx, inlink->w, inlink->h);
> +        if (err < 0)
> +            goto fail;
> +    }
> +
> +    cle = clEnqueueWriteBuffer(ctx->command_queue, ctx->overflow, CL_FALSE,
> +                               0, sizeof(cl_int), &zero, 0, NULL, NULL);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to initialize overflow"
> +                     "detection buffer %d.\n", cle);
> +
> +    for (p = 0; p < FF_ARRAY_ELEMS(output->data); p++) {
> +        src = (cl_mem) input->data[p];
> +        dst = (cl_mem) output->data[p];
> +
> +        if (!dst)
> +            break;

I think I'd assert that src is not null here as well, just in case.

> +        w = p ? ctx->chroma_w : inlink->w;
> +        h = p ? ctx->chroma_h : inlink->h;
> +        patch = (p ? ctx->patch_size_uv : ctx->patch_size) / 2;
> +        research = (p ? ctx->research_size_uv : ctx->research_size) / 2;

Is this intended for the GBRP case?  Intuitively I would expect it to treat each of GBR the same, but maybe it's preferable for green to be special somehow.

> +        err = nlmeans_plane(avctx, dst, src, w, h, patch, research);
> +        if (err < 0)
> +            goto fail;
> +    }
> +    // overflow occured?
> +    cle = clEnqueueReadBuffer(ctx->command_queue, ctx->overflow, CL_FALSE,
> +                              0, sizeof(cl_int), &overflow, 0, NULL, NULL);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to read overflow: %d.\n", cle);
> +
> +    cle = clFinish(ctx->command_queue);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to finish kernel: %d.\n", cle);
> +
> +    if (overflow > 0)
> +      av_log(avctx, AV_LOG_ERROR, "integral image overflow %d\n", overflow);
> +
> +    av_frame_free(&input);
> +
> +    av_log(ctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n",
> +           av_get_pix_fmt_name(output->format),
> +           output->width, output->height, output->pts);
> +
> +    return ff_filter_frame(outlink, output);
> +
> +fail:
> +    clFinish(ctx->command_queue);
> +    av_frame_free(&input);
> +    av_frame_free(&output);
> +    return err;
> +}
> ...

Thanks,

- Mark
Ruiling Song May 7, 2019, 1:06 a.m.
> -----Original Message-----

> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf

> Of Mark Thompson

> Sent: Monday, May 6, 2019 10:20 PM

> To: ffmpeg-devel@ffmpeg.org

> Subject: Re: [FFmpeg-devel] [PATCH V2 2/2] lavfi/opencl: add

> nlmeans_opencl filter

> 

> On 29/04/2019 03:06, Song, Ruiling wrote:>

> > In order to verify the patch, I also have more testing on the CPU OpenCL

> driver from Intel.

> > I make it run 100 times, and still not see any reported overflow. So I think

> we can say the filter is in good quality to be merged. Any different idea?

> 

> I've tried a lot more times on some additional platforms (Skylake-GT3, Mali-

> G52) and I can't reproduce it on anything else.  So, I think I agree that it must

> be a driver issue and shouldn't block anything.

> 

> 

> On 12/04/2019 16:09, Ruiling Song wrote:

> > Signed-off-by: Ruiling Song <ruiling.song@intel.com>

> > ---

> >  configure                       |   1 +

> >  doc/filters.texi                |   4 +

> >  libavfilter/Makefile            |   1 +

> >  libavfilter/allfilters.c        |   1 +

> >  libavfilter/opencl/nlmeans.cl   | 115 +++++++++

> >  libavfilter/opencl_source.h     |   1 +

> >  libavfilter/vf_nlmeans_opencl.c | 442

> ++++++++++++++++++++++++++++++++

> >  7 files changed, 565 insertions(+)

> >  create mode 100644 libavfilter/opencl/nlmeans.cl

> >  create mode 100644 libavfilter/vf_nlmeans_opencl.c

> >

> > ...

> > +

> > +static int nlmeans_plane(AVFilterContext *avctx, cl_mem dst, cl_mem src,

> > +                         cl_int width, cl_int height, cl_int p, cl_int r)

> > +{

> > +    NLMeansOpenCLContext *ctx = avctx->priv;

> > +    const float zero = 0.0f;

> > +    const size_t worksize1[] = {height};

> > +    const size_t worksize2[] = {width};

> > +    const size_t worksize3[2] = {width, height};

> > +    int dx, dy, err = 0, weight_buf_size;

> > +    cl_int cle;

> > +    int nb_pixel, *tmp, idx = 0;

> > +    cl_int *dxdy;

> > +

> > +    weight_buf_size = width * height * sizeof(float);

> > +    cle = clEnqueueFillBuffer(ctx->command_queue, ctx->weight,

> > +                              &zero, sizeof(float), 0, weight_buf_size,

> > +                              0, NULL, NULL);

> > +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to fill weight buffer: %d.\n",

> > +                     cle);

> > +    cle = clEnqueueFillBuffer(ctx->command_queue, ctx->sum,

> > +                              &zero, sizeof(float), 0, weight_buf_size,

> > +                              0, NULL, NULL);

> > +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to fill sum buffer: %d.\n",

> > +                     cle);

> > +

> > +    nb_pixel = (2 * r + 1) * (2 * r + 1) - 1;

> > +    dxdy = av_malloc(nb_pixel * 2 * sizeof(cl_int));

> > +    tmp = av_malloc(nb_pixel * 2 * sizeof(int));

> > +

> > +    if (!dxdy || !tmp)

> > +        goto fail;

> > +

> > +    for (dx = -r; dx <= r; dx++) {

> > +        for (dy = -r; dy <= r; dy++) {

> > +            if (dx || dy) {

> > +                tmp[idx++] = dx;

> > +                tmp[idx++] = dy;

> > +            }

> > +        }

> > +    }

> > +    // repack dx/dy seperately, as we want to do four pairs of dx/dy in a

> batch

> > +    for (int i = 0; i < nb_pixel / 4; i++) {

> > +        dxdy[i * 8] = tmp[i * 8];         // dx0

> > +        dxdy[i * 8 + 1] = tmp[i * 8 + 2]; // dx1

> > +        dxdy[i * 8 + 2] = tmp[i * 8 + 4]; // dx2

> > +        dxdy[i * 8 + 3] = tmp[i * 8 + 6]; // dx3

> > +        dxdy[i * 8 + 4] = tmp[i * 8 + 1]; // dy0

> > +        dxdy[i * 8 + 5] = tmp[i * 8 + 3]; // dy1

> > +        dxdy[i * 8 + 6] = tmp[i * 8 + 5]; // dy2

> > +        dxdy[i * 8 + 7] = tmp[i * 8 + 7]; // dy3

> > +    }

> > +    av_freep(&tmp);

> > +

> > +    for (int i = 0; i < nb_pixel / 4; i++) {

> > +        int *dx_cur = dxdy + 8 * i;

> > +        int *dy_cur = dxdy + 8 * i + 4;

> 

> cl_int.

Fixed
> 

> > +

> > +        // horizontal pass

> > +        // integral(x,y) = sum([u(v,y) - u(v+dx,y+dy)]^2) for v in [0, x]

> > +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 0, cl_mem, &ctx-

> >integral_img);

> > +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 1, cl_mem, &src);

> > +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 2, cl_int, &width);

> > +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 3, cl_int, &height);

> > +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 4, cl_int4, dx_cur);

> > +        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 5, cl_int4, dy_cur);

> > +        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx-

> >horiz_kernel, 1,

> > +                               NULL, worksize1, NULL, 0, NULL, NULL);

> > +        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue

> horiz_kernel: %d.\n",

> > +                         cle);

> > +        // vertical pass

> > +        // integral(x, y) = sum(integral(x, v)) for v in [0, y]

> > +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 0, cl_mem, &ctx-

> >integral_img);

> > +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 1, cl_mem, &ctx->overflow);

> > +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 2, cl_int, &width);

> > +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 3, cl_int, &height);

> > +        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx-

> >vert_kernel,

> > +                                     1, NULL, worksize2, NULL, 0, NULL, NULL);

> > +        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue

> vert_kernel: %d.\n",

> > +                         cle);

> > +

> > +        // accumlate weights

> > +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 0, cl_mem, &ctx->sum);

> > +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 1, cl_mem, &ctx->weight);

> > +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 2, cl_mem, &ctx-

> >integral_img);

> > +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 3, cl_mem, &src);

> > +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 4, cl_int, &width);

> > +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 5, cl_int, &height);

> > +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 6, cl_int, &p);

> > +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 7, cl_float, &ctx->h);

> > +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 8, cl_int4, dx_cur);

> > +        CL_SET_KERNEL_ARG(ctx->accum_kernel, 9, cl_int4, dy_cur);

> > +        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx-

> >accum_kernel,

> > +                                     2, NULL, worksize3, NULL, 0, NULL, NULL);

> > +        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue

> kernel: %d.\n", cle);

> > +    }

> > +    av_freep(&dxdy);

> > +

> > +    // average

> > +    CL_SET_KERNEL_ARG(ctx->average_kernel, 0, cl_mem, &dst);

> > +    CL_SET_KERNEL_ARG(ctx->average_kernel, 1, cl_mem, &src);

> > +    CL_SET_KERNEL_ARG(ctx->average_kernel, 2, cl_mem, &ctx->sum);

> > +    CL_SET_KERNEL_ARG(ctx->average_kernel, 3, cl_mem, &ctx->weight);

> > +    cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx-

> >average_kernel, 2,

> > +                                 NULL, worksize3, NULL, 0, NULL, NULL);

> > +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue average

> kernel: %d.\n",

> > +                     cle);

> > +    cle = clFlush(ctx->command_queue);

> > +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to flush command

> queue: %d.\n", cle);

> > +fail:

> > +    if (tmp)

> > +      av_freep(&tmp);

> > +    if (dxdy)

> > +      av_freep(&dxdy);

> 

> Funny indent.

Fixed
> 

> > +    return err;

> > +}

> > +

> > +static int nlmeans_opencl_filter_frame(AVFilterLink *inlink, AVFrame

> *input)

> > +{

> > +    AVFilterContext    *avctx = inlink->dst;

> > +    AVFilterLink     *outlink = avctx->outputs[0];

> > +    NLMeansOpenCLContext *ctx = avctx->priv;

> > +    AVFrame *output = NULL;

> > +    AVHWFramesContext *input_frames_ctx;

> > +    const AVPixFmtDescriptor *desc;

> > +    enum AVPixelFormat in_format;

> > +    cl_mem src, dst;

> > +    const cl_int zero = 0;

> > +    int w, h, err, cle, overflow, p, patch, research;

> > +

> > +    av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",

> > +           av_get_pix_fmt_name(input->format),

> > +           input->width, input->height, input->pts);

> > +

> > +    if (!input->hw_frames_ctx)

> > +        return AVERROR(EINVAL);

> > +    input_frames_ctx = (AVHWFramesContext*)input->hw_frames_ctx-

> >data;

> > +    in_format = input_frames_ctx->sw_format;

> > +

> > +    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);

> > +    if (!output) {

> > +        err = AVERROR(ENOMEM);

> > +        goto fail;

> > +    }

> > +

> > +    err = av_frame_copy_props(output, input);

> > +    if (err < 0)

> > +        goto fail;

> > +

> > +    if (!ctx->initialised) {

> > +        desc = av_pix_fmt_desc_get(in_format);

> > +        if (!is_format_supported(in_format)) {

> > +            err = AVERROR(EINVAL);

> > +            av_log(avctx, AV_LOG_ERROR, "input format %s not supported\n",

> > +                   av_get_pix_fmt_name(in_format));

> > +            goto fail;

> > +        }

> > +        ctx->chroma_w = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);

> > +        ctx->chroma_h = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);

> > +

> > +        err = nlmeans_opencl_init(avctx, inlink->w, inlink->h);

> > +        if (err < 0)

> > +            goto fail;

> > +    }

> > +

> > +    cle = clEnqueueWriteBuffer(ctx->command_queue, ctx->overflow,

> CL_FALSE,

> > +                               0, sizeof(cl_int), &zero, 0, NULL, NULL);

> > +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to initialize overflow"

> > +                     "detection buffer %d.\n", cle);

> > +

> > +    for (p = 0; p < FF_ARRAY_ELEMS(output->data); p++) {

> > +        src = (cl_mem) input->data[p];

> > +        dst = (cl_mem) output->data[p];

> > +

> > +        if (!dst)

> > +            break;

> 

> I think I'd assert that src is not null here as well, just in case.

I have not seen other filter asserting against this. do you have specific concern?
Anyway, I am ok to add the assert here.
> 

> > +        w = p ? ctx->chroma_w : inlink->w;

> > +        h = p ? ctx->chroma_h : inlink->h;

> > +        patch = (p ? ctx->patch_size_uv : ctx->patch_size) / 2;

> > +        research = (p ? ctx->research_size_uv : ctx->research_size) / 2;

> 

> Is this intended for the GBRP case?  Intuitively I would expect it to treat each

> of GBR the same, but maybe it's preferable for green to be special somehow.

No, users are allowed to set different research window and patch size for chroma plane through "pc" and "rc" options.

> > ...

> 

> Thanks,

> 

> - Mark

> _______________________________________________

> ffmpeg-devel mailing list

> ffmpeg-devel@ffmpeg.org

> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

> 

> To unsubscribe, visit link above, or email

> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

Patch hide | download patch | download mbox

diff --git a/configure b/configure
index 0cdf0ffa8a..93ebfd6784 100755
--- a/configure
+++ b/configure
@@ -3461,6 +3461,7 @@  mpdecimate_filter_select="pixelutils"
 minterpolate_filter_select="scene_sad"
 mptestsrc_filter_deps="gpl"
 negate_filter_deps="lut_filter"
+nlmeans_opencl_filter_deps="opencl"
 nnedi_filter_deps="gpl"
 ocr_filter_deps="libtesseract"
 ocv_filter_deps="libopencv"
diff --git a/doc/filters.texi b/doc/filters.texi
index 867607d870..21c2c1a4b5 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -19030,6 +19030,10 @@  Apply erosion filter with threshold0 set to 30, threshold1 set 40, threshold2 se
 @end example
 @end itemize
 
+@section nlmeans_opencl
+
+Non-local Means denoise filter through OpenCL, this filter accepts same options as @ref{nlmeans}.
+
 @section overlay_opencl
 
 Overlay one video on top of another.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index fef6ec5c55..92039bfdcf 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -291,6 +291,7 @@  OBJS-$(CONFIG_MIX_FILTER)                    += vf_mix.o
 OBJS-$(CONFIG_MPDECIMATE_FILTER)             += vf_mpdecimate.o
 OBJS-$(CONFIG_NEGATE_FILTER)                 += vf_lut.o
 OBJS-$(CONFIG_NLMEANS_FILTER)                += vf_nlmeans.o
+OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER)         += vf_nlmeans_opencl.o opencl.o opencl/nlmeans.o
 OBJS-$(CONFIG_NNEDI_FILTER)                  += vf_nnedi.o
 OBJS-$(CONFIG_NOFORMAT_FILTER)               += vf_format.o
 OBJS-$(CONFIG_NOISE_FILTER)                  += vf_noise.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index c51ae0f3c7..2a6390c92d 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -277,6 +277,7 @@  extern AVFilter ff_vf_mix;
 extern AVFilter ff_vf_mpdecimate;
 extern AVFilter ff_vf_negate;
 extern AVFilter ff_vf_nlmeans;
+extern AVFilter ff_vf_nlmeans_opencl;
 extern AVFilter ff_vf_nnedi;
 extern AVFilter ff_vf_noformat;
 extern AVFilter ff_vf_noise;
diff --git a/libavfilter/opencl/nlmeans.cl b/libavfilter/opencl/nlmeans.cl
new file mode 100644
index 0000000000..72bd681fd6
--- /dev/null
+++ b/libavfilter/opencl/nlmeans.cl
@@ -0,0 +1,115 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
+                           CLK_ADDRESS_CLAMP_TO_EDGE   |
+                           CLK_FILTER_NEAREST);
+
+kernel void horiz_sum(__global uint4 *integral_img,
+                      __read_only image2d_t src,
+                      int width,
+                      int height,
+                      int4 dx,
+                      int4 dy)
+{
+
+    int y = get_global_id(0);
+    int work_size = get_global_size(0);
+
+    uint4 sum = (uint4)(0);
+    float4 s2;
+    for (int i = 0; i < width; i++) {
+        float s1 = read_imagef(src, sampler, (int2)(i, y)).x;
+        s2.x = read_imagef(src, sampler, (int2)(i + dx.x, y + dy.x)).x;
+        s2.y = read_imagef(src, sampler, (int2)(i + dx.y, y + dy.y)).x;
+        s2.z = read_imagef(src, sampler, (int2)(i + dx.z, y + dy.z)).x;
+        s2.w = read_imagef(src, sampler, (int2)(i + dx.w, y + dy.w)).x;
+        sum += convert_uint4((s1 - s2) * (s1 - s2) * 255 * 255);
+        integral_img[y * width + i] = sum;
+    }
+}
+
+kernel void vert_sum(__global uint4 *integral_img,
+                     __global int *overflow,
+                     int width,
+                     int height)
+{
+    int x = get_global_id(0);
+    uint4 sum = 0;
+    for (int i = 0; i < height; i++) {
+        if (any((uint4)UINT_MAX - integral_img[i * width + x] < sum))
+            atomic_inc(overflow);
+        integral_img[i * width + x] += sum;
+        sum = integral_img[i * width + x];
+    }
+}
+
+kernel void weight_accum(global float *sum, global float *weight,
+                         global uint4 *integral_img, __read_only image2d_t src,
+                         int width, int height, int p, float h,
+                         int4 dx, int4 dy)
+{
+    // w(x) = integral_img(x-p, y-p) +
+    //        integral_img(x+p, y+p) -
+    //        integral_img(x+p, y-p) -
+    //        integral_img(x-p, y+p)
+    // total_sum[x] += w(x, y) * src(x + dx, y + dy)
+    // total_weight += w(x, y)
+
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    int4 xoff = x + dx;
+    int4 yoff = y + dy;
+    uint4 a = 0, b = 0, c = 0, d = 0;
+    uint4 src_pix = 0;
+
+    // out-of-bounding-box?
+    int oobb = (x - p) < 0 || (y - p) < 0 || (y + p) >= height || (x + p) >= width;
+
+    src_pix.x = (int)(255 * read_imagef(src, sampler, (int2)(xoff.x, yoff.x)).x);
+    src_pix.y = (int)(255 * read_imagef(src, sampler, (int2)(xoff.y, yoff.y)).x);
+    src_pix.z = (int)(255 * read_imagef(src, sampler, (int2)(xoff.z, yoff.z)).x);
+    src_pix.w = (int)(255 * read_imagef(src, sampler, (int2)(xoff.w, yoff.w)).x);
+    if (!oobb) {
+        a = integral_img[(y - p) * width + x - p];
+        b = integral_img[(y + p) * width + x - p];
+        c = integral_img[(y - p) * width + x + p];
+        d = integral_img[(y + p) * width + x + p];
+    }
+
+    float4 patch_diff = convert_float4(d + a - c - b);
+    float4 w = native_exp(-patch_diff / (h * h));
+    float w_sum = w.x + w.y + w.z + w.w;
+    weight[y * width + x] += w_sum;
+    sum[y * width + x] += dot(w, convert_float4(src_pix));
+}
+
+kernel void average(__write_only image2d_t dst,
+                    __read_only image2d_t src,
+                    global float *sum, global float *weight) {
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    int2 dim = get_image_dim(dst);
+
+    float w = weight[y * dim.x + x];
+    float s = sum[y * dim.x + x];
+    float src_pix = read_imagef(src, sampler, (int2)(x, y)).x;
+    float r = (s + src_pix * 255) / (1.0f + w) / 255.0f;
+    if (x < dim.x && y < dim.y)
+        write_imagef(dst, (int2)(x, y), (float4)(r, 0.0f, 0.0f, 1.0f));
+}
diff --git a/libavfilter/opencl_source.h b/libavfilter/opencl_source.h
index 4118138c30..fd40fd7dca 100644
--- a/libavfilter/opencl_source.h
+++ b/libavfilter/opencl_source.h
@@ -23,6 +23,7 @@  extern const char *ff_opencl_source_avgblur;
 extern const char *ff_opencl_source_colorspace_common;
 extern const char *ff_opencl_source_convolution;
 extern const char *ff_opencl_source_neighbor;
+extern const char *ff_opencl_source_nlmeans;
 extern const char *ff_opencl_source_overlay;
 extern const char *ff_opencl_source_tonemap;
 extern const char *ff_opencl_source_transpose;
diff --git a/libavfilter/vf_nlmeans_opencl.c b/libavfilter/vf_nlmeans_opencl.c
new file mode 100644
index 0000000000..0cb2075e4d
--- /dev/null
+++ b/libavfilter/vf_nlmeans_opencl.c
@@ -0,0 +1,442 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include <float.h>
+
+#include "libavutil/avassert.h"
+#include "libavutil/common.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/mem.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "avfilter.h"
+#include "internal.h"
+#include "opencl.h"
+#include "opencl_source.h"
+#include "video.h"
+
+// TODO:
+//      the integral image may overflow 32bit, consider using 64bit
+
+static const enum AVPixelFormat supported_formats[] = {
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_GBRP,
+};
+
+static int is_format_supported(enum AVPixelFormat fmt)
+{
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
+        if (supported_formats[i] == fmt)
+            return 1;
+    return 0;
+}
+
+typedef struct NLMeansOpenCLContext {
+    OpenCLFilterContext   ocf;
+    int                   initialised;
+    cl_kernel             vert_kernel;
+    cl_kernel             horiz_kernel;
+    cl_kernel             accum_kernel;
+    cl_kernel             average_kernel;
+    cl_mem                integral_img;
+    cl_mem                weight;
+    cl_mem                sum;
+    cl_mem                overflow; // overflow in integral image?
+    double                sigma;
+    float                 h;
+    int                   chroma_w;
+    int                   chroma_h;
+    int                   patch_size;
+    int                   patch_size_uv;
+    int                   research_size;
+    int                   research_size_uv;
+    cl_command_queue      command_queue;
+} NLMeansOpenCLContext;
+
+static int nlmeans_opencl_init(AVFilterContext *avctx, int width, int height)
+{
+    NLMeansOpenCLContext *ctx = avctx->priv;
+    cl_int cle;
+    int err;
+    int weight_buf_size = width * height * sizeof(float);
+
+    ctx->h = ctx->sigma * 10;
+    if (!(ctx->research_size & 1)) {
+        ctx->research_size |= 1;
+        av_log(avctx, AV_LOG_WARNING,
+               "research_size should be odd, set to %d",
+               ctx->research_size);
+    }
+
+    if (!(ctx->patch_size & 1)) {
+        ctx->patch_size |= 1;
+        av_log(avctx, AV_LOG_WARNING,
+               "patch_size should be odd, set to %d",
+               ctx->patch_size);
+    }
+
+    if (!ctx->research_size_uv)
+        ctx->research_size_uv = ctx->research_size;
+    if (!ctx->patch_size_uv)
+        ctx->patch_size_uv = ctx->patch_size;
+
+    err = ff_opencl_filter_load_program(avctx, &ff_opencl_source_nlmeans, 1);
+    if (err < 0)
+        goto fail;
+
+    ctx->command_queue = clCreateCommandQueue(ctx->ocf.hwctx->context,
+                                              ctx->ocf.hwctx->device_id,
+                                              0, &cle);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create OpenCL "
+                     "command queue %d.\n", cle);
+
+    ctx->vert_kernel = clCreateKernel(ctx->ocf.program,
+                                      "vert_sum", &cle);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
+                     "vert_sum kernel %d.\n", cle);
+
+    ctx->horiz_kernel = clCreateKernel(ctx->ocf.program,
+                                       "horiz_sum", &cle);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
+                     "horiz_sum kernel %d.\n", cle);
+
+    ctx->accum_kernel = clCreateKernel(ctx->ocf.program,
+                                       "weight_accum", &cle);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
+                     "accum kernel %d.\n", cle);
+
+    ctx->average_kernel = clCreateKernel(ctx->ocf.program,
+                                         "average", &cle);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
+                     "average kernel %d.\n", cle);
+
+    ctx->integral_img = clCreateBuffer(ctx->ocf.hwctx->context, 0,
+                                       4 * width * height * sizeof(cl_int),
+                                       NULL, &cle);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
+                     "integral image %d.\n", cle);
+
+    ctx->weight = clCreateBuffer(ctx->ocf.hwctx->context, 0,
+                                 weight_buf_size, NULL, &cle);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
+                     "weight buffer %d.\n", cle);
+
+    ctx->sum = clCreateBuffer(ctx->ocf.hwctx->context, 0,
+                              weight_buf_size, NULL, &cle);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
+                     "sum buffer %d.\n", cle);
+
+    ctx->overflow = clCreateBuffer(ctx->ocf.hwctx->context, 0,
+                                   sizeof(cl_int), NULL, &cle);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
+                     "overflow buffer %d.\n", cle);
+
+    ctx->initialised = 1;
+    return 0;
+
+fail:
+    CL_RELEASE_KERNEL(ctx->vert_kernel);
+    CL_RELEASE_KERNEL(ctx->horiz_kernel);
+    CL_RELEASE_KERNEL(ctx->accum_kernel);
+    CL_RELEASE_KERNEL(ctx->average_kernel);
+
+    CL_RELEASE_MEMORY(ctx->integral_img);
+    CL_RELEASE_MEMORY(ctx->weight);
+    CL_RELEASE_MEMORY(ctx->sum);
+    CL_RELEASE_MEMORY(ctx->overflow);
+
+    CL_RELEASE_QUEUE(ctx->command_queue);
+    return err;
+}
+
+static int nlmeans_plane(AVFilterContext *avctx, cl_mem dst, cl_mem src,
+                         cl_int width, cl_int height, cl_int p, cl_int r)
+{
+    NLMeansOpenCLContext *ctx = avctx->priv;
+    const float zero = 0.0f;
+    const size_t worksize1[] = {height};
+    const size_t worksize2[] = {width};
+    const size_t worksize3[2] = {width, height};
+    int dx, dy, err = 0, weight_buf_size;
+    cl_int cle;
+    int nb_pixel, *tmp, idx = 0;
+    cl_int *dxdy;
+
+    weight_buf_size = width * height * sizeof(float);
+    cle = clEnqueueFillBuffer(ctx->command_queue, ctx->weight,
+                              &zero, sizeof(float), 0, weight_buf_size,
+                              0, NULL, NULL);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to fill weight buffer: %d.\n",
+                     cle);
+    cle = clEnqueueFillBuffer(ctx->command_queue, ctx->sum,
+                              &zero, sizeof(float), 0, weight_buf_size,
+                              0, NULL, NULL);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to fill sum buffer: %d.\n",
+                     cle);
+
+    nb_pixel = (2 * r + 1) * (2 * r + 1) - 1;
+    dxdy = av_malloc(nb_pixel * 2 * sizeof(cl_int));
+    tmp = av_malloc(nb_pixel * 2 * sizeof(int));
+
+    if (!dxdy || !tmp)
+        goto fail;
+
+    for (dx = -r; dx <= r; dx++) {
+        for (dy = -r; dy <= r; dy++) {
+            if (dx || dy) {
+                tmp[idx++] = dx;
+                tmp[idx++] = dy;
+            }
+        }
+    }
+    // repack dx/dy seperately, as we want to do four pairs of dx/dy in a batch
+    for (int i = 0; i < nb_pixel / 4; i++) {
+        dxdy[i * 8] = tmp[i * 8];         // dx0
+        dxdy[i * 8 + 1] = tmp[i * 8 + 2]; // dx1
+        dxdy[i * 8 + 2] = tmp[i * 8 + 4]; // dx2
+        dxdy[i * 8 + 3] = tmp[i * 8 + 6]; // dx3
+        dxdy[i * 8 + 4] = tmp[i * 8 + 1]; // dy0
+        dxdy[i * 8 + 5] = tmp[i * 8 + 3]; // dy1
+        dxdy[i * 8 + 6] = tmp[i * 8 + 5]; // dy2
+        dxdy[i * 8 + 7] = tmp[i * 8 + 7]; // dy3
+    }
+    av_freep(&tmp);
+
+    for (int i = 0; i < nb_pixel / 4; i++) {
+        int *dx_cur = dxdy + 8 * i;
+        int *dy_cur = dxdy + 8 * i + 4;
+
+        // horizontal pass
+        // integral(x,y) = sum([u(v,y) - u(v+dx,y+dy)]^2) for v in [0, x]
+        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 0, cl_mem, &ctx->integral_img);
+        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 1, cl_mem, &src);
+        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 2, cl_int, &width);
+        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 3, cl_int, &height);
+        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 4, cl_int4, dx_cur);
+        CL_SET_KERNEL_ARG(ctx->horiz_kernel, 5, cl_int4, dy_cur);
+        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->horiz_kernel, 1,
+                               NULL, worksize1, NULL, 0, NULL, NULL);
+        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue horiz_kernel: %d.\n",
+                         cle);
+        // vertical pass
+        // integral(x, y) = sum(integral(x, v)) for v in [0, y]
+        CL_SET_KERNEL_ARG(ctx->vert_kernel, 0, cl_mem, &ctx->integral_img);
+        CL_SET_KERNEL_ARG(ctx->vert_kernel, 1, cl_mem, &ctx->overflow);
+        CL_SET_KERNEL_ARG(ctx->vert_kernel, 2, cl_int, &width);
+        CL_SET_KERNEL_ARG(ctx->vert_kernel, 3, cl_int, &height);
+        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->vert_kernel,
+                                     1, NULL, worksize2, NULL, 0, NULL, NULL);
+        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue vert_kernel: %d.\n",
+                         cle);
+
+        // accumlate weights
+        CL_SET_KERNEL_ARG(ctx->accum_kernel, 0, cl_mem, &ctx->sum);
+        CL_SET_KERNEL_ARG(ctx->accum_kernel, 1, cl_mem, &ctx->weight);
+        CL_SET_KERNEL_ARG(ctx->accum_kernel, 2, cl_mem, &ctx->integral_img);
+        CL_SET_KERNEL_ARG(ctx->accum_kernel, 3, cl_mem, &src);
+        CL_SET_KERNEL_ARG(ctx->accum_kernel, 4, cl_int, &width);
+        CL_SET_KERNEL_ARG(ctx->accum_kernel, 5, cl_int, &height);
+        CL_SET_KERNEL_ARG(ctx->accum_kernel, 6, cl_int, &p);
+        CL_SET_KERNEL_ARG(ctx->accum_kernel, 7, cl_float, &ctx->h);
+        CL_SET_KERNEL_ARG(ctx->accum_kernel, 8, cl_int4, dx_cur);
+        CL_SET_KERNEL_ARG(ctx->accum_kernel, 9, cl_int4, dy_cur);
+        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->accum_kernel,
+                                     2, NULL, worksize3, NULL, 0, NULL, NULL);
+        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue kernel: %d.\n", cle);
+    }
+    av_freep(&dxdy);
+
+    // average
+    CL_SET_KERNEL_ARG(ctx->average_kernel, 0, cl_mem, &dst);
+    CL_SET_KERNEL_ARG(ctx->average_kernel, 1, cl_mem, &src);
+    CL_SET_KERNEL_ARG(ctx->average_kernel, 2, cl_mem, &ctx->sum);
+    CL_SET_KERNEL_ARG(ctx->average_kernel, 3, cl_mem, &ctx->weight);
+    cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->average_kernel, 2,
+                                 NULL, worksize3, NULL, 0, NULL, NULL);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue average kernel: %d.\n",
+                     cle);
+    cle = clFlush(ctx->command_queue);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to flush command queue: %d.\n", cle);
+fail:
+    if (tmp)
+      av_freep(&tmp);
+    if (dxdy)
+      av_freep(&dxdy);
+    return err;
+}
+
+static int nlmeans_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
+{
+    AVFilterContext    *avctx = inlink->dst;
+    AVFilterLink     *outlink = avctx->outputs[0];
+    NLMeansOpenCLContext *ctx = avctx->priv;
+    AVFrame *output = NULL;
+    AVHWFramesContext *input_frames_ctx;
+    const AVPixFmtDescriptor *desc;
+    enum AVPixelFormat in_format;
+    cl_mem src, dst;
+    const cl_int zero = 0;
+    int w, h, err, cle, overflow, p, patch, research;
+
+    av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",
+           av_get_pix_fmt_name(input->format),
+           input->width, input->height, input->pts);
+
+    if (!input->hw_frames_ctx)
+        return AVERROR(EINVAL);
+    input_frames_ctx = (AVHWFramesContext*)input->hw_frames_ctx->data;
+    in_format = input_frames_ctx->sw_format;
+
+    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+    if (!output) {
+        err = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    err = av_frame_copy_props(output, input);
+    if (err < 0)
+        goto fail;
+
+    if (!ctx->initialised) {
+        desc = av_pix_fmt_desc_get(in_format);
+        if (!is_format_supported(in_format)) {
+            err = AVERROR(EINVAL);
+            av_log(avctx, AV_LOG_ERROR, "input format %s not supported\n",
+                   av_get_pix_fmt_name(in_format));
+            goto fail;
+        }
+        ctx->chroma_w = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);
+        ctx->chroma_h = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
+
+        err = nlmeans_opencl_init(avctx, inlink->w, inlink->h);
+        if (err < 0)
+            goto fail;
+    }
+
+    cle = clEnqueueWriteBuffer(ctx->command_queue, ctx->overflow, CL_FALSE,
+                               0, sizeof(cl_int), &zero, 0, NULL, NULL);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to initialize overflow"
+                     "detection buffer %d.\n", cle);
+
+    for (p = 0; p < FF_ARRAY_ELEMS(output->data); p++) {
+        src = (cl_mem) input->data[p];
+        dst = (cl_mem) output->data[p];
+
+        if (!dst)
+            break;
+        w = p ? ctx->chroma_w : inlink->w;
+        h = p ? ctx->chroma_h : inlink->h;
+        patch = (p ? ctx->patch_size_uv : ctx->patch_size) / 2;
+        research = (p ? ctx->research_size_uv : ctx->research_size) / 2;
+        err = nlmeans_plane(avctx, dst, src, w, h, patch, research);
+        if (err < 0)
+            goto fail;
+    }
+    // overflow occured?
+    cle = clEnqueueReadBuffer(ctx->command_queue, ctx->overflow, CL_FALSE,
+                              0, sizeof(cl_int), &overflow, 0, NULL, NULL);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to read overflow: %d.\n", cle);
+
+    cle = clFinish(ctx->command_queue);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to finish kernel: %d.\n", cle);
+
+    if (overflow > 0)
+      av_log(avctx, AV_LOG_ERROR, "integral image overflow %d\n", overflow);
+
+    av_frame_free(&input);
+
+    av_log(ctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n",
+           av_get_pix_fmt_name(output->format),
+           output->width, output->height, output->pts);
+
+    return ff_filter_frame(outlink, output);
+
+fail:
+    clFinish(ctx->command_queue);
+    av_frame_free(&input);
+    av_frame_free(&output);
+    return err;
+}
+
+static av_cold void nlmeans_opencl_uninit(AVFilterContext *avctx)
+{
+    NLMeansOpenCLContext *ctx = avctx->priv;
+    cl_int cle;
+
+    CL_RELEASE_KERNEL(ctx->vert_kernel);
+    CL_RELEASE_KERNEL(ctx->horiz_kernel);
+    CL_RELEASE_KERNEL(ctx->accum_kernel);
+    CL_RELEASE_KERNEL(ctx->average_kernel);
+
+    CL_RELEASE_MEMORY(ctx->integral_img);
+    CL_RELEASE_MEMORY(ctx->weight);
+    CL_RELEASE_MEMORY(ctx->sum);
+    CL_RELEASE_MEMORY(ctx->overflow);
+
+    CL_RELEASE_QUEUE(ctx->command_queue);
+
+    ff_opencl_filter_uninit(avctx);
+}
+
+#define OFFSET(x) offsetof(NLMeansOpenCLContext, x)
+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
+static const AVOption nlmeans_opencl_options[] = {
+    { "s",  "denoising strength", OFFSET(sigma), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 30.0, FLAGS },
+    { "p",  "patch size",                   OFFSET(patch_size),    AV_OPT_TYPE_INT, { .i64 = 2*3+1 }, 0, 99, FLAGS },
+    { "pc", "patch size for chroma planes", OFFSET(patch_size_uv), AV_OPT_TYPE_INT, { .i64 = 0 },     0, 99, FLAGS },
+    { "r",  "research window",                   OFFSET(research_size),    AV_OPT_TYPE_INT, { .i64 = 7*2+1 }, 0, 99, FLAGS },
+    { "rc", "research window for chroma planes", OFFSET(research_size_uv), AV_OPT_TYPE_INT, { .i64 = 0 },     0, 99, FLAGS },
+    { NULL }
+};
+
+AVFILTER_DEFINE_CLASS(nlmeans_opencl);
+
+static const AVFilterPad nlmeans_opencl_inputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .filter_frame = &nlmeans_opencl_filter_frame,
+        .config_props = &ff_opencl_filter_config_input,
+    },
+    { NULL }
+};
+
+static const AVFilterPad nlmeans_opencl_outputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = &ff_opencl_filter_config_output,
+    },
+    { NULL }
+};
+
+AVFilter ff_vf_nlmeans_opencl = {
+    .name           = "nlmeans_opencl",
+    .description    = NULL_IF_CONFIG_SMALL("Non-local means denoiser through OpenCL"),
+    .priv_size      = sizeof(NLMeansOpenCLContext),
+    .priv_class     = &nlmeans_opencl_class,
+    .init           = &ff_opencl_filter_init,
+    .uninit         = &nlmeans_opencl_uninit,
+    .query_formats  = &ff_opencl_filter_query_formats,
+    .inputs         = nlmeans_opencl_inputs,
+    .outputs        = nlmeans_opencl_outputs,
+    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};