
[FFmpeg-devel] lavfi: add opencl tonemap filter.

Message ID: 1526885417-12826-1-git-send-email-ruiling.song@intel.com
State: Superseded

Commit Message

Ruiling Song May 21, 2018, 6:50 a.m. UTC
This filter does HDR (HDR10/HLG) to SDR conversion with tone-mapping.

An example command to use this filter with vaapi codecs:
ffmpeg -init_hw_device vaapi=va:/dev/dri/renderD128 -init_hw_device \
opencl=ocl@va -hwaccel vaapi -hwaccel_device va -hwaccel_output_format \
vaapi -i INPUT -filter_hw_device ocl -filter_complex \
'[0:v]hwmap,tonemap_opencl=t=bt2020:tonemap=linear:format=p010[x1]; \
[x1]hwmap=derive_device=vaapi:reverse=1' -c:v hevc_vaapi -profile 2 OUTPUT

Signed-off-by: Ruiling Song <ruiling.song@intel.com>
---
 configure                              |   1 +
 libavfilter/Makefile                   |   2 +
 libavfilter/allfilters.c               |   1 +
 libavfilter/colorspace_basic.c         |  89 ++++++
 libavfilter/colorspace_basic.h         |  40 +++
 libavfilter/opencl/colorspace_basic.cl | 179 +++++++++++
 libavfilter/opencl/tonemap.cl          | 258 +++++++++++++++
 libavfilter/opencl_source.h            |   2 +
 libavfilter/vf_tonemap_opencl.c        | 560 +++++++++++++++++++++++++++++++++
 9 files changed, 1132 insertions(+)
 create mode 100644 libavfilter/colorspace_basic.c
 create mode 100644 libavfilter/colorspace_basic.h
 create mode 100644 libavfilter/opencl/colorspace_basic.cl
 create mode 100644 libavfilter/opencl/tonemap.cl
 create mode 100644 libavfilter/vf_tonemap_opencl.c

Comments

Jun Zhao May 21, 2018, 7:23 a.m. UTC | #1
2018-05-21 14:50 GMT+08:00 Ruiling Song <ruiling.song@intel.com>:
> This filter does HDR (HDR10/HLG) to SDR conversion with tone-mapping.
>
> An example command to use this filter with vaapi codecs:
> ffmpeg -init_hw_device vaapi=va:/dev/dri/renderD128 -init_hw_device \
> opencl=ocl@va -hwaccel vaapi -hwaccel_device va -hwaccel_output_format \
> vaapi -i INPUT -filter_hw_device ocl -filter_complex \
> '[0:v]hwmap,tonemap_opencl=t=bt2020:tonemap=linear:format=p010[x1]; \
> [x1]hwmap=derive_device=vaapi:reverse=1' -c:v hevc_vaapi -profile 2 OUTPUT
>
> Signed-off-by: Ruiling Song <ruiling.song@intel.com>
> ---
>  configure                              |   1 +
>  libavfilter/Makefile                   |   2 +
>  libavfilter/allfilters.c               |   1 +
>  libavfilter/colorspace_basic.c         |  89 ++++++
>  libavfilter/colorspace_basic.h         |  40 +++
>  libavfilter/opencl/colorspace_basic.cl | 179 +++++++++++
>  libavfilter/opencl/tonemap.cl          | 258 +++++++++++++++
>  libavfilter/opencl_source.h            |   2 +
>  libavfilter/vf_tonemap_opencl.c        | 560 +++++++++++++++++++++++++++++++++
>  9 files changed, 1132 insertions(+)
>  create mode 100644 libavfilter/colorspace_basic.c
>  create mode 100644 libavfilter/colorspace_basic.h
>  create mode 100644 libavfilter/opencl/colorspace_basic.cl
>  create mode 100644 libavfilter/opencl/tonemap.cl
>  create mode 100644 libavfilter/vf_tonemap_opencl.c
>
> diff --git a/configure b/configure
> index e52f8f8..ee3586b 100755
> --- a/configure
> +++ b/configure
> @@ -3401,6 +3401,7 @@ tinterlace_filter_deps="gpl"
>  tinterlace_merge_test_deps="tinterlace_filter"
>  tinterlace_pad_test_deps="tinterlace_filter"
>  tonemap_filter_deps="const_nan"
> +tonemap_opencl_filter_deps="opencl"
>  unsharp_opencl_filter_deps="opencl"
>  uspp_filter_deps="gpl avcodec"
>  vaguedenoiser_filter_deps="gpl"
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index c68ef05..0915656 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -352,6 +352,8 @@ OBJS-$(CONFIG_TINTERLACE_FILTER)             += vf_tinterlace.o
>  OBJS-$(CONFIG_TLUT2_FILTER)                  += vf_lut2.o framesync.o
>  OBJS-$(CONFIG_TMIX_FILTER)                   += vf_mix.o framesync.o
>  OBJS-$(CONFIG_TONEMAP_FILTER)                += vf_tonemap.o
> +OBJS-$(CONFIG_TONEMAP_OPENCL_FILTER)         += vf_tonemap_opencl.o colorspace_basic.o opencl.o \
> +                                                opencl/tonemap.o opencl/colorspace_basic.o
>  OBJS-$(CONFIG_TRANSPOSE_FILTER)              += vf_transpose.o
>  OBJS-$(CONFIG_TRIM_FILTER)                   += trim.o
>  OBJS-$(CONFIG_UNPREMULTIPLY_FILTER)          += vf_premultiply.o framesync.o
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index b44093d..6873bab 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -343,6 +343,7 @@ extern AVFilter ff_vf_tinterlace;
>  extern AVFilter ff_vf_tlut2;
>  extern AVFilter ff_vf_tmix;
>  extern AVFilter ff_vf_tonemap;
> +extern AVFilter ff_vf_tonemap_opencl;
>  extern AVFilter ff_vf_transpose;
>  extern AVFilter ff_vf_trim;
>  extern AVFilter ff_vf_unpremultiply;
> diff --git a/libavfilter/colorspace_basic.c b/libavfilter/colorspace_basic.c
> new file mode 100644
> index 0000000..93f9f08
> --- /dev/null
> +++ b/libavfilter/colorspace_basic.c
> @@ -0,0 +1,89 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "colorspace_basic.h"
> +
> +
> +void invert_matrix3x3(const double in[3][3], double out[3][3])
> +{
> +    double m00 = in[0][0], m01 = in[0][1], m02 = in[0][2],
> +           m10 = in[1][0], m11 = in[1][1], m12 = in[1][2],
> +           m20 = in[2][0], m21 = in[2][1], m22 = in[2][2];
> +    int i, j;
> +    double det;
> +
> +    out[0][0] =  (m11 * m22 - m21 * m12);
> +    out[0][1] = -(m01 * m22 - m21 * m02);
> +    out[0][2] =  (m01 * m12 - m11 * m02);
> +    out[1][0] = -(m10 * m22 - m20 * m12);
> +    out[1][1] =  (m00 * m22 - m20 * m02);
> +    out[1][2] = -(m00 * m12 - m10 * m02);
> +    out[2][0] =  (m10 * m21 - m20 * m11);
> +    out[2][1] = -(m00 * m21 - m20 * m01);
> +    out[2][2] =  (m00 * m11 - m10 * m01);
> +
> +    det = m00 * out[0][0] + m10 * out[0][1] + m20 * out[0][2];
> +    det = 1.0 / det;
> +
> +    for (i = 0; i < 3; i++) {
> +        for (j = 0; j < 3; j++)
> +            out[i][j] *= det;
> +    }
> +}
> +
> +void mul3x3(double dst[3][3], const double src1[3][3], const double src2[3][3])
> +{
> +    int m, n;
> +
> +    for (m = 0; m < 3; m++)
> +        for (n = 0; n < 3; n++)
> +            dst[m][n] = src2[m][0] * src1[0][n] +
> +                        src2[m][1] * src1[1][n] +
> +                        src2[m][2] * src1[2][n];
> +}
> +/*
> + * see e.g. http://www.brucelindbloom.com/index.html?Eqn_RGB_XYZ_Matrix.html
> + */
> +void fill_rgb2xyz_table(const struct ColorPrimaries *coeffs,
> +                        const struct WhitePoint *wp,
> +                        double rgb2xyz[3][3])
> +{
> +    double i[3][3], sr, sg, sb, zw;
> +
> +    rgb2xyz[0][0] = coeffs->xr / coeffs->yr;
> +    rgb2xyz[0][1] = coeffs->xg / coeffs->yg;
> +    rgb2xyz[0][2] = coeffs->xb / coeffs->yb;
> +    rgb2xyz[1][0] = rgb2xyz[1][1] = rgb2xyz[1][2] = 1.0;
> +    rgb2xyz[2][0] = (1.0 - coeffs->xr - coeffs->yr) / coeffs->yr;
> +    rgb2xyz[2][1] = (1.0 - coeffs->xg - coeffs->yg) / coeffs->yg;
> +    rgb2xyz[2][2] = (1.0 - coeffs->xb - coeffs->yb) / coeffs->yb;
> +    invert_matrix3x3(rgb2xyz, i);
> +    zw = 1.0 - wp->xw - wp->yw;
> +    sr = i[0][0] * wp->xw + i[0][1] * wp->yw + i[0][2] * zw;
> +    sg = i[1][0] * wp->xw + i[1][1] * wp->yw + i[1][2] * zw;
> +    sb = i[2][0] * wp->xw + i[2][1] * wp->yw + i[2][2] * zw;
> +    rgb2xyz[0][0] *= sr;
> +    rgb2xyz[0][1] *= sg;
> +    rgb2xyz[0][2] *= sb;
> +    rgb2xyz[1][0] *= sr;
> +    rgb2xyz[1][1] *= sg;
> +    rgb2xyz[1][2] *= sb;
> +    rgb2xyz[2][0] *= sr;
> +    rgb2xyz[2][1] *= sg;
> +    rgb2xyz[2][2] *= sb;
> +}
> diff --git a/libavfilter/colorspace_basic.h b/libavfilter/colorspace_basic.h
> new file mode 100644
> index 0000000..5647ca6
> --- /dev/null
> +++ b/libavfilter/colorspace_basic.h
> @@ -0,0 +1,40 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVFILTER_COLORSPACE_BASIC_H
> +#define AVFILTER_COLORSPACE_BASIC_H
> +
> +#include "libavutil/common.h"
> +
> +struct LumaCoefficients {
> +    double cr, cg, cb;
> +};
> +
> +struct ColorPrimaries {
> +    double xr, yr, xg, yg, xb, yb;
> +};
> +
> +struct WhitePoint {
> +    double xw, yw;
> +};
> +
> +void invert_matrix3x3(const double in[3][3], double out[3][3]);
> +void mul3x3(double dst[3][3], const double src1[3][3], const double src2[3][3]);
> +void fill_rgb2xyz_table(const struct ColorPrimaries *coeffs,
> +                        const struct WhitePoint *wp, double rgb2xyz[3][3]);
> +#endif
> diff --git a/libavfilter/opencl/colorspace_basic.cl b/libavfilter/opencl/colorspace_basic.cl
> new file mode 100644
> index 0000000..ffd98c2
> --- /dev/null
> +++ b/libavfilter/opencl/colorspace_basic.cl
> @@ -0,0 +1,179 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#define ST2084_MAX_LUMINANCE 10000.0f
> +#define REFERENCE_WHITE 100.0f
> +constant const float ST2084_M1 = 0.1593017578125f;
> +constant const float ST2084_M2 = 78.84375f;
> +constant const float ST2084_C1 = 0.8359375f;
> +constant const float ST2084_C2 = 18.8515625f;
> +constant const float ST2084_C3 = 18.6875f;
> +
> +// TODO Move these colorspace matrix to .cpp files
What's with the .cpp files? Is this ported from some C++ file?
> +__constant float yuv2rgb_bt2020[] = {
> +    1.0f, 0.0f, 1.4746f,
> +    1.0f, -0.16455f, -0.57135f,
> +    1.0f, 1.8814f, 0.0f
> +};
> +
> +__constant float yuv2rgb_bt709[] = {
> +    1.0f, 0.0f, 1.5748f,
> +    1.0f, -0.18732f, -0.46812f,
> +    1.0f, 1.8556f, 0.0f
> +};
> +
> +__constant float rgb2yuv_bt709[] = {
> +    0.2126f, 0.7152f, 0.0722f,
> +    -0.11457f, -0.38543f, 0.5f,
> +    0.5f, -0.45415f, -0.04585f
> +};
> +
> +__constant float rgb2yuv_bt2020[] ={
> +    0.2627f, 0.678f, 0.0593f,
> +    -0.1396f, -0.36037f, 0.5f,
> +    0.5f, -0.4598f, -0.0402f,
> +};
> +
> +
> +float get_luma_dst(float3 c) {
> +    return luma_dst.x * c.x + luma_dst.y * c.y + luma_dst.z * c.z;
> +}
> +
> +float get_luma_src(float3 c) {
> +    return luma_src.x * c.x + luma_src.y * c.y + luma_src.z * c.z;
> +}
> +
> +float eotf_st2084(float x) {
> +    float p = pow(x, 1.0f / ST2084_M2);
> +    float a = max(p -ST2084_C1, 0.0f);
> +    float b = max(ST2084_C2 - ST2084_C3 * p, 1e-6f);
> +    float c  = pow(a / b, 1.0f / ST2084_M1);
> +    return x > 0.0f ? c * ST2084_MAX_LUMINANCE / REFERENCE_WHITE : 0.0f;
> +}
> +
> +__constant const float HLG_A = 0.17883277f;
> +__constant const float HLG_B = 0.28466892f;
> +__constant const float HLG_C = 0.55991073f;
> +
> +// linearizer for HLG
> +float inverse_oetf_hlg(float x) {
> +    float a = 4.0f * x * x;
> +    float b = exp((x - HLG_C) / HLG_A) + HLG_B;
> +    return x < 0.5f ? a : b;
> +}
> +
> +// delinearizer for HLG
> +float oetf_hlg(float x) {
> +    float a = 0.5f * sqrt(x);
> +    float b = HLG_A * log(x - HLG_B) + HLG_C;
> +    return x <= 1.0f ? a : b;
> +}
> +
> +float3 ootf_hlg(float3 c) {
> +    float luma = get_luma_src(c);
> +    // assume a reference display with 1000 nits peak
> +    float factor = 1000.0f / REFERENCE_WHITE * pow(luma, 0.2f) / pow(12.0f, 1.2f);
> +    return c * factor;
> +}
> +
> +float3 inverse_ootf_hlg(float3 c) {
> +    // assume a reference display with 1000 nits peak
> +    c *=  pow(12.0f, 1.2f) / (1000.0f / REFERENCE_WHITE);
> +    c /= pow(get_luma_dst(c), 0.2f / 1.2f);
> +    return c;
> +}
> +
> +float inverse_eotf_bt1886(float c) {
> +    return c < 0.0f ? 0.0f : pow(c, 1.0f / 2.4f);
> +}
> +
> +float oetf_bt709(float c) {
> +    c = c < 0.0f ? 0.0f : c;
> +    float r1 = 4.5f * c;
> +    float r2 = 1.099f * pow(c, 0.45f) - 0.099f;
> +    return c < 0.018f ? r1 : r2;
> +}
> +float inverse_oetf_bt709(float c) {
> +    float r1 = c / 4.5f;
> +    float r2 = pow((c + 0.099f) / 1.099f, 1.0f / 0.45f);
> +    return c < 0.081f ? r1 : r2;
> +}
> +
> +float3 yuv2rgb(float y, float u, float v) {
> +#ifdef FULL_RANGE_IN
> +    u -= 0.5f; v -= 0.5f;
> +#else
> +    y = (y * 255.0f -  16.0f) / 219.0f;
> +    u = (u * 255.0f - 128.0f) / 224.0f;
> +    v = (v * 255.0f - 128.0f) / 224.0f;
> +#endif
> +    float r = y * rgb_matrix[0] + u * rgb_matrix[1] + v * rgb_matrix[2];
> +    float g = y * rgb_matrix[3] + u * rgb_matrix[4] + v * rgb_matrix[5];
> +    float b = y * rgb_matrix[6] + u * rgb_matrix[7] + v * rgb_matrix[8];
> +    return (float3)(r, g, b);
> +}
> +
> +float3 yuv2lrgb(float3 yuv) {
> +    float3 rgb = yuv2rgb(yuv.x, yuv.y, yuv.z);
> +    float r = linearize(rgb.x);
> +    float g = linearize(rgb.y);
> +    float b = linearize(rgb.z);
> +    return (float3)(r, g, b);
> +}
> +
> +float3 rgb2yuv(float r, float g, float b) {
> +    float y = r*yuv_matrix[0] + g*yuv_matrix[1] + b*yuv_matrix[2];
> +    float u = r*yuv_matrix[3] + g*yuv_matrix[4] + b*yuv_matrix[5];
> +    float v = r*yuv_matrix[6] + g*yuv_matrix[7] + b*yuv_matrix[8];
> +#ifdef FULL_RANGE_OUT
> +    u += 0.5f; v += 0.5f;
> +#else
> +    y = (219.0f * y + 16.0f) / 255.0f;
> +    u = (224.0f * u + 128.0f) / 255.0f;
> +    v = (224.0f * v + 128.0f) / 255.0f;
> +#endif
> +    return (float3)(y, u, v);
> +}
> +
> +float3 lrgb2yuv(float3 c) {
> +    float r = delinearize(c.x);
> +    float g = delinearize(c.y);
> +    float b = delinearize(c.z);
> +
> +    return rgb2yuv(r, g, b);
> +}
> +
> +float3 lrgb2lrgb(float3 c) {
> +#ifdef RGB2RGB_PASSTHROUGH
> +    return c;
> +#else
> +    float r = c.x, g = c.y, b = c.z;
> +    float rr = rgb2rgb[0] * r + rgb2rgb[1] * g + rgb2rgb[2] * b;
> +    float gg = rgb2rgb[3] * r + rgb2rgb[4] * g + rgb2rgb[5] * b;
> +    float bb = rgb2rgb[6] * r + rgb2rgb[7] * g + rgb2rgb[8] * b;
> +    return (float3)(rr, gg, bb);
> +#endif
> +}
> +
> +float3 ootf(float3 c) {
> +    return ootf_impl(c);
> +}
> +
> +float3 inverse_ootf(float3 c) {
> +    return inverse_ootf_impl(c);
> +}
> diff --git a/libavfilter/opencl/tonemap.cl b/libavfilter/opencl/tonemap.cl
> new file mode 100644
> index 0000000..03cf3e2
> --- /dev/null
> +++ b/libavfilter/opencl/tonemap.cl
> @@ -0,0 +1,258 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#define REFERENCE_WHITE 100.0f
> +extern float3 lrgb2yuv(float3);
> +extern float3 yuv2lrgb(float3);
> +extern float3 lrgb2lrgb(float3);
> +extern float get_luma_src(float3);
> +extern float get_luma_dst(float3);
> +extern float3 ootf(float3);
> +extern float3 inverse_ootf(float3);
> +struct detection_result {
> +    float peak;
> +    float average;
> +};
> +
> +float hable_f(float in) {
> +    float a = 0.15f, b = 0.50f, c = 0.10f, d = 0.20f, e = 0.02f, f = 0.30f;
> +    return (in * (in * a + b * c) + d * e) / (in * (in * a + b) + d * f) - e / f;
> +}
> +
> +float direct(float s, float peak) {
> +    return s;
> +}
> +
> +float linear(float s, float peak) {
> +    return s * tone_param / peak;
> +}
> +
> +float gamma(float s, float peak) {
> +    float p = s > 0.05f ? s /peak : 0.05f / peak;
> +    float v = pow(p, 1.0f / tone_param);
> +    return s > 0.05f ? v : (s * v /0.05f);
> +}
> +
> +float clip(float s, float peak) {
> +    return clamp(s * tone_param, 0.0f, 1.0f);
> +}
> +
> +float reinhard(float s, float peak) {
> +    return s / (s + tone_param) * (peak + tone_param) / peak;
> +}
> +
> +float hable(float s, float peak) {
> +    return hable_f(s)/hable_f(peak);
> +}
> +
> +float mobius(float s, float peak) {
> +    float j = tone_param;
> +    float a, b;
> +
> +    if (s <= j)
> +        return s;
> +
> +    a = -j * j * (peak - 1.0f) / (j * j - 2.0f * j + peak);
> +    b = (j * j - 2.0f * j * peak + peak) / max(peak - 1.0f, 1e-6f);
> +
> +    return (b * b + 2.0f * b * j + j * j) / (b - a) * (s + a) / (s + b);
> +}
> +
> +// detect peak/average signal of a frame, the algorithm was ported from:
> +// libplacebo (https://github.com/haasn/libplacebo)
> +struct detection_result
> +detect_peak_avg(global uint *util_buf, __local uint *sum_wg,
> +            float signal, float peak) {
> +    global uint *avg_buf = util_buf;
> +    global uint *peak_buf = avg_buf + DETECTION_FRAMES + 1;
> +    global uint *counter_wg_p = peak_buf + DETECTION_FRAMES + 1;
> +    global uint *max_total_p = counter_wg_p + 1;
> +    global uint *avg_total_p = max_total_p + 1;
> +    global uint *frame_idx_p = avg_total_p + 1;
> +    global uint *scene_frame_num_p = frame_idx_p + 1;
> +
> +    uint frame_idx = *frame_idx_p;
> +    uint scene_frame_num = *scene_frame_num_p;
> +
> +    size_t lidx = get_local_id(0);
> +    size_t lidy = get_local_id(1);
> +    size_t lsizex = get_local_size(0);
> +    size_t lsizey = get_local_size(1);
> +    uint num_wg = get_num_groups(0) * get_num_groups(1);
> +    size_t group_idx = get_group_id(0);
> +    size_t group_idy = get_group_id(1);
> +    struct detection_result r = {peak, sdr_avg};
> +    *sum_wg = 0;
> +    barrier(CLK_LOCAL_MEM_FENCE);
> +
> +    // update workgroup sum
> +    atomic_add(sum_wg, (uint)(signal * REFERENCE_WHITE));
> +    barrier(CLK_LOCAL_MEM_FENCE);
> +
> +    // update frame peak/avg using work-group-average.
> +    if (lidx == 0 && lidy == 0) {
> +        uint avg_wg = *sum_wg / (lsizex * lsizey);
> +        atomic_max(&peak_buf[frame_idx], avg_wg);
> +        atomic_add(&avg_buf[frame_idx], avg_wg);
> +    }
> +
> +    if (scene_frame_num > 0) {
> +        float peak = (float)*max_total_p / (REFERENCE_WHITE * scene_frame_num);
> +        float avg = (float)*avg_total_p / (REFERENCE_WHITE * scene_frame_num);
> +        r.peak = max(1.0f, peak);
> +        r.average = max(0.25f, avg);
> +    }
> +
> +    if (lidx == 0 && lidy == 0 && atomic_add(counter_wg_p, 1) == num_wg - 1) {
> +        *counter_wg_p = 0;
> +        avg_buf[frame_idx] /= num_wg;
> +
> +        if (scene_threshold > 0.0f) {
> +            uint cur_max = peak_buf[frame_idx];
> +            uint cur_avg = avg_buf[frame_idx];
> +            int diff = (int)(scene_frame_num * cur_avg) - (int)*avg_total_p;
> +
> +            if (abs(diff) > scene_frame_num * scene_threshold * REFERENCE_WHITE) {
> +                for (uint i = 0; i < DETECTION_FRAMES + 1; i++)
> +                  avg_buf[i] = 0;
> +                for (uint i = 0; i < DETECTION_FRAMES + 1; i++)
> +                  peak_buf[i] = 0;
> +                *avg_total_p = *max_total_p = 0;
> +                *scene_frame_num_p = 0;
> +                avg_buf[frame_idx] = cur_avg;
> +                peak_buf[frame_idx] = cur_max;
> +            }
> +        }
> +        uint next = (frame_idx + 1) % (DETECTION_FRAMES + 1);
> +        // add current frame, subtract next frame
> +        *max_total_p += peak_buf[frame_idx] - peak_buf[next];
> +        *avg_total_p += avg_buf[frame_idx] - avg_buf[next];
> +        // reset next frame
> +        peak_buf[next] = avg_buf[next] = 0;
> +        *frame_idx_p = next;
> +        *scene_frame_num_p = min(*scene_frame_num_p + 1, (uint)DETECTION_FRAMES);
> +    }
> +    return r;
> +}
> +
> +__constant const float desat_param = 0.5f;
> +__constant const float dst_peak    = 1.0f;
> +
> +float3 map_one_pixel_rgb(float3 rgb, float peak, float average) {
> +    float sig = max(max(rgb.x, max(rgb.y, rgb.z)), 1e-6f);
> +    // de-saturate
> +    if (desat_param > 0.0f) {
> +        float luma = get_luma_dst(rgb);
> +        float base = 0.18f * dst_peak;
> +        float coeff = max(sig - base, 1e-6f) / max(sig, 1e-6f);
> +        coeff = native_powr(coeff, 10.0f / desat_param);
> +        rgb = mix(rgb, (float3)luma, (float3)coeff);
> +        sig = mix(sig, luma, coeff);
> +    }
> +
> +    float sig_old = sig;
> +    float slope = min(1.0f, sdr_avg / average);
> +    sig *= slope;
> +    peak *= slope;
> +
> +    sig = TONE_FUNC(sig, peak);
> +    rgb *= (sig/sig_old);
> +    return rgb;
> +}
> +// map from source space YUV to destination space RGB
> +float3 map_to_dst_space_from_yuv(float3 yuv) {
> +    float3 c = yuv2lrgb(yuv);
> +    c = ootf(c);
> +    c = lrgb2lrgb(c);
> +    return c;
> +}
> +
> +// convert from rgb to yuv, with possible inverse-ootf
> +float3 convert_to_yuv(float3 c) {
> +    c = inverse_ootf(c);
> +    return lrgb2yuv(c);
> +}
> +
> +__kernel void tonemap(__write_only image2d_t dst1,
> +                      __write_only image2d_t dst2,
> +                      __read_only  image2d_t src1,
> +                      __read_only  image2d_t src2,
> +#ifdef THIRD_PLANE
> +                      __write_only image2d_t dst3,
> +                      __read_only  image2d_t src3,
> +#endif
> +                      global uint *util_buf,
> +                      float peak
> +                      )
> +{
> +    __local uint sum_wg;
> +    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
> +                               CLK_FILTER_NEAREST);
> +    int xi = get_global_id(0);
> +    int yi = get_global_id(1);
> +    // each work item process four pixels
> +    int x = 2 * xi;
> +    int y = 2 * yi;
> +
> +    float y0 = read_imagef(src1, sampler, (int2)(x,     y)).x;
> +    float y1 = read_imagef(src1, sampler, (int2)(x + 1, y)).x;
> +    float y2 = read_imagef(src1, sampler, (int2)(x,     y + 1)).x;
> +    float y3 = read_imagef(src1, sampler, (int2)(x + 1, y + 1)).x;
> +#ifdef THIRD_PLANE
> +    float u = read_imagef(src2, sampler, (int2)(xi, yi)).x;
> +    float v = read_imagef(src3, sampler, (int2)(xi, yi)).x;
> +    float2 uv = (float2)(u, v);
> +#else
> +    float2 uv = read_imagef(src2, sampler, (int2)(xi,     yi)).xy;
> +#endif
> +
> +    float3 c0 = map_to_dst_space_from_yuv((float3)(y0, uv.x, uv.y));
> +    float3 c1 = map_to_dst_space_from_yuv((float3)(y1, uv.x, uv.y));
> +    float3 c2 = map_to_dst_space_from_yuv((float3)(y2, uv.x, uv.y));
> +    float3 c3 = map_to_dst_space_from_yuv((float3)(y3, uv.x, uv.y));
> +
> +    float sig0 = max(c0.x, max(c0.y, c0.z));
> +    float sig1 = max(c1.x, max(c1.y, c1.z));
> +    float sig2 = max(c2.x, max(c2.y, c2.z));
> +    float sig3 = max(c3.x, max(c3.y, c3.z));
> +    float sig = max(sig0, max(sig1, max(sig2, sig3)));
> +
> +    struct detection_result r = detect_peak_avg(util_buf, &sum_wg, sig, peak);
> +
> +    float3 c0_old = c0, c1_old = c1, c2_old = c2;
> +    c0 = map_one_pixel_rgb(c0, r.peak, r.average);
> +    c1 = map_one_pixel_rgb(c1, r.peak, r.average);
> +    c2 = map_one_pixel_rgb(c2, r.peak, r.average);
> +    c3 = map_one_pixel_rgb(c3, r.peak, r.average);
> +
> +    float3 yuv0 = convert_to_yuv(c0);
> +    float3 yuv1 = convert_to_yuv(c1);
> +    float3 yuv2 = convert_to_yuv(c2);
> +    float3 yuv3 = convert_to_yuv(c3);
> +
> +    write_imagef(dst1, (int2)(x, y), (float4)(yuv0.x, 0.0f, 0.0f, 1.0f));
> +    write_imagef(dst1, (int2)(x+1, y), (float4)(yuv1.x, 0.0f, 0.0f, 1.0f));
> +    write_imagef(dst1, (int2)(x, y+1), (float4)(yuv2.x, 0.0f, 0.0f, 1.0f));
> +    write_imagef(dst1, (int2)(x+1, y+1), (float4)(yuv3.x, 0.0f, 0.0f, 1.0f));
> +#ifdef THIRD_PLANE
> +    write_imagef(dst2, (int2)(xi, yi), (float4)(yuv0.y, 0.0f, 0.0f, 1.0f));
> +    write_imagef(dst3, (int2)(xi, yi), (float4)(yuv0.z, 0.0f, 0.0f, 1.0f));
> +#else
> +    write_imagef(dst2, (int2)(xi, yi), (float4)(yuv0.y, yuv0.z, 0.0f, 1.0f));
> +#endif
> +}
> diff --git a/libavfilter/opencl_source.h b/libavfilter/opencl_source.h
> index 4bb9969..c5b3f37 100644
> --- a/libavfilter/opencl_source.h
> +++ b/libavfilter/opencl_source.h
> @@ -21,7 +21,9 @@
>
>  extern const char *ff_opencl_source_avgblur;
>  extern const char *ff_opencl_source_convolution;
> +extern const char *ff_opencl_source_colorspace_basic;
This needs to follow the alphabetizing convention.
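i.e. with the two new declarations sorted in, the block would read:

    extern const char *ff_opencl_source_avgblur;
    extern const char *ff_opencl_source_colorspace_basic;
    extern const char *ff_opencl_source_convolution;
    extern const char *ff_opencl_source_overlay;
    extern const char *ff_opencl_source_tonemap;
    extern const char *ff_opencl_source_unsharp;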
>  extern const char *ff_opencl_source_overlay;
> +extern const char *ff_opencl_source_tonemap;
>  extern const char *ff_opencl_source_unsharp;
>
>  #endif /* AVFILTER_OPENCL_SOURCE_H */
> diff --git a/libavfilter/vf_tonemap_opencl.c b/libavfilter/vf_tonemap_opencl.c
> new file mode 100644
> index 0000000..e2311e0
> --- /dev/null
> +++ b/libavfilter/vf_tonemap_opencl.c
> @@ -0,0 +1,560 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +#include <float.h>
> +
> +#include "libavutil/bprint.h"
> +#include "libavutil/common.h"
> +#include "libavutil/imgutils.h"
> +#include "libavutil/mastering_display_metadata.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/pixdesc.h"
> +
> +#include "avfilter.h"
> +#include "internal.h"
> +#include "opencl.h"
> +#include "opencl_source.h"
> +#include "video.h"
> +#include "colorspace_basic.h"
> +
> +#define DETECTION_FRAMES 63
> +#define REFERENCE_WHITE 100.0f
> +
> +enum TonemapAlgorithm {
> +    TONEMAP_NONE,
> +    TONEMAP_LINEAR,
> +    TONEMAP_GAMMA,
> +    TONEMAP_CLIP,
> +    TONEMAP_REINHARD,
> +    TONEMAP_HABLE,
> +    TONEMAP_MOBIUS,
> +    TONEMAP_MAX,
> +};
> +
> +typedef struct TonemapOpenCLContext {
> +    OpenCLFilterContext ocf;
> +
> +    enum AVColorSpace colorspace, colorspace_in, colorspace_out;
> +    enum AVColorTransferCharacteristic trc, trc_in, trc_out;
> +    enum AVColorPrimaries primaries, primaries_in, primaries_out;
> +    enum AVColorRange range, range_in, range_out;
> +
> +    enum TonemapAlgorithm tonemap;
> +    enum AVPixelFormat    format;
> +    double                peak;
> +    double                param;
> +    int                   initialised;
> +    cl_kernel             kernel;
> +    cl_command_queue      command_queue;
> +    cl_mem                util_mem;
> +    DECLARE_ALIGNED(64, int32_t, util_buf)[2 * DETECTION_FRAMES + 7];
> +} TonemapOpenCLContext;
> +
> +const char *yuv_coff[AVCOL_SPC_NB] = {
> +    [AVCOL_SPC_BT709] = "rgb2yuv_bt709",
> +    [AVCOL_SPC_BT2020_NCL] = "rgb2yuv_bt2020",
> +};
> +
> +const char *rgb_coff[AVCOL_SPC_NB] = {
> +    [AVCOL_SPC_BT709] = "yuv2rgb_bt709",
> +    [AVCOL_SPC_BT2020_NCL] = "yuv2rgb_bt2020",
> +};
> +
> +const char *linearize_funcs[AVCOL_TRC_NB] = {
> +    [AVCOL_TRC_SMPTE2084] = "eotf_st2084",
> +    [AVCOL_TRC_ARIB_STD_B67] = "inverse_oetf_hlg",
> +};
> +
> +const char *ootf_funcs[AVCOL_TRC_NB] = {
> +    [AVCOL_TRC_ARIB_STD_B67] = "ootf_hlg",
> +    [AVCOL_TRC_SMPTE2084] = "",
> +};
> +
> +const char *inverse_ootf_funcs[AVCOL_TRC_NB] = {
> +    [AVCOL_TRC_ARIB_STD_B67] = "inverse_ootf_hlg",
> +    [AVCOL_TRC_SMPTE2084] = "",
> +};
> +
> +const char *delinearize_funcs[AVCOL_TRC_NB] = {
> +    [AVCOL_TRC_BT709]     = "inverse_eotf_bt1886",
> +    [AVCOL_TRC_BT2020_10] = "inverse_eotf_bt1886",
> +};
> +
> +static const struct LumaCoefficients luma_coefficients[AVCOL_SPC_NB] = {
> +    [AVCOL_SPC_BT709]      = { 0.2126, 0.7152, 0.0722 },
> +    [AVCOL_SPC_BT2020_NCL] = { 0.2627, 0.6780, 0.0593 },
> +};
> +
> +struct ColorPrimaries primaries_table[AVCOL_PRI_NB] = {
> +    [AVCOL_PRI_BT709]  = { 0.640, 0.330, 0.300, 0.600, 0.150, 0.060 },
> +    [AVCOL_PRI_BT2020] = { 0.708, 0.292, 0.170, 0.797, 0.131, 0.046 },
> +};
> +
> +struct WhitePoint whitepoint_table[AVCOL_PRI_NB] = {
> +    [AVCOL_PRI_BT709]  = { 0.3127, 0.3290 },
> +    [AVCOL_PRI_BT2020] = { 0.3127, 0.3290 },
> +};
> +
> +const char *tonemap_func[TONEMAP_MAX] = {
> +    [TONEMAP_NONE]     = "direct",
> +    [TONEMAP_LINEAR]   = "linear",
> +    [TONEMAP_GAMMA]    = "gamma",
> +    [TONEMAP_CLIP]     = "clip",
> +    [TONEMAP_REINHARD] = "reinhard",
> +    [TONEMAP_HABLE]    = "hable",
> +    [TONEMAP_MOBIUS]   = "mobius",
> +};
> +
> +static void get_rgb2rgb_matrix(enum AVColorPrimaries in, enum AVColorPrimaries out,
> +                               double rgb2rgb[3][3]) {
> +    double rgb2xyz[3][3], xyz2rgb[3][3];
> +
> +    fill_rgb2xyz_table(&primaries_table[out], &whitepoint_table[out], rgb2xyz);
> +    invert_matrix3x3(rgb2xyz, xyz2rgb);
> +    fill_rgb2xyz_table(&primaries_table[in], &whitepoint_table[in], rgb2xyz);
> +    mul3x3(rgb2rgb, rgb2xyz, xyz2rgb);
> +}
> +
> +#define OPENCL_SOURCE_NB 3
> +// Average light level for SDR signals. This is equal to a signal level of 0.5
> +// under a typical presentation gamma of about 2.0.
> +static const float sdr_avg = 0.25f;
> +static const float scene_threshold = 0.2f;
> +
> +static int tonemap_opencl_init(AVFilterContext *avctx)
> +{
> +    TonemapOpenCLContext *ctx = avctx->priv;
> +    int rgb2rgb_passthrough = 1;
> +    double rgb2rgb[3][3];
> +    struct LumaCoefficients luma_src, luma_dst;
> +    cl_int cle;
> +    int err;
> +    AVBPrint header;
> +    const char *opencl_sources[OPENCL_SOURCE_NB];
> +
> +    av_bprint_init(&header, 1024, AV_BPRINT_SIZE_AUTOMATIC);
> +
> +    switch(ctx->tonemap) {
> +    case TONEMAP_GAMMA:
> +        if (isnan(ctx->param))
> +            ctx->param = 1.8f;
> +        break;
> +    case TONEMAP_REINHARD:
> +        if (!isnan(ctx->param))
> +            ctx->param = (1.0f - ctx->param) / ctx->param;
> +        break;
> +    case TONEMAP_MOBIUS:
> +        if (isnan(ctx->param))
> +            ctx->param = 0.3f;
> +        break;
> +    }
> +
> +    if (isnan(ctx->param))
> +        ctx->param = 1.0f;
> +
> +    av_bprintf(&header, "__constant const float tone_param = %.4ff;\n",
> +               ctx->param);
> +    av_bprintf(&header, "__constant const float sdr_avg = %.4ff;\n", sdr_avg);
> +    av_bprintf(&header, "__constant const float scene_threshold = %.4ff;\n",
> +               scene_threshold);
> +    av_bprintf(&header, "#define TONE_FUNC %s\n", tonemap_func[ctx->tonemap]);
> +    av_bprintf(&header, "#define DETECTION_FRAMES %d\n", DETECTION_FRAMES);
> +
> +    if (ctx->primaries_out != ctx->primaries_in) {
> +        get_rgb2rgb_matrix(ctx->primaries_in, ctx->primaries_out, rgb2rgb);
> +        rgb2rgb_passthrough = 0;
> +    }
> +    if (ctx->range_in == AVCOL_RANGE_JPEG)
> +        av_bprintf(&header, "#define FULL_RANGE_IN\n");
> +
> +    if (ctx->range_out == AVCOL_RANGE_JPEG)
> +        av_bprintf(&header, "#define FULL_RANGE_OUT\n");
> +
> +    if (rgb2rgb_passthrough)
> +        av_bprintf(&header, "#define RGB2RGB_PASSTHROUGH\n");
> +    else {
> +        av_bprintf(&header, "__constant float rgb2rgb[9] = {\n");
> +        av_bprintf(&header, "    %.4ff, %.4ff, %.4ff,\n",
> +                   rgb2rgb[0][0], rgb2rgb[0][1], rgb2rgb[0][2]);
> +        av_bprintf(&header, "    %.4ff, %.4ff, %.4ff,\n",
> +                   rgb2rgb[1][0], rgb2rgb[1][1], rgb2rgb[1][2]);
> +        av_bprintf(&header, "    %.4ff, %.4ff, %.4ff};\n",
> +                   rgb2rgb[2][0], rgb2rgb[2][1], rgb2rgb[2][2]);
> +    }
> +
> +    av_bprintf(&header, "#define rgb_matrix %s\n",
> +               rgb_coff[ctx->colorspace_in]);
> +    av_bprintf(&header, "#define yuv_matrix %s\n",
> +               yuv_coff[ctx->colorspace_out]);
> +
> +    luma_src = luma_coefficients[ctx->colorspace_in];
> +    luma_dst = luma_coefficients[ctx->colorspace_out];
> +    av_bprintf(&header, "constant float3 luma_src = {%.4ff, %.4ff, %.4ff};\n",
> +               luma_src.cr, luma_src.cg, luma_src.cb);
> +    av_bprintf(&header, "constant float3 luma_dst = {%.4ff, %.4ff, %.4ff};\n",
> +               luma_dst.cr, luma_dst.cg, luma_dst.cb);
> +
> +    av_bprintf(&header, "#define linearize %s\n", linearize_funcs[ctx->trc_in]);
> +    av_bprintf(&header, "#define delinearize %s\n",
> +               delinearize_funcs[ctx->trc_out]);
> +
> +    av_bprintf(&header, "#define ootf_impl %s\n", ootf_funcs[ctx->trc_in]);
> +    av_bprintf(&header, "#define inverse_ootf_impl %s\n",
> +               inverse_ootf_funcs[ctx->trc_in]);
> +
> +
> +    opencl_sources[0] = header.str;
> +    opencl_sources[1] = ff_opencl_source_tonemap;
> +    opencl_sources[2] = ff_opencl_source_colorspace_basic;
> +    err = ff_opencl_filter_load_program(avctx, opencl_sources, OPENCL_SOURCE_NB);
> +
> +    av_bprint_finalize(&header, NULL);
> +    if (err < 0)
> +        goto fail;
> +
> +    ctx->command_queue = clCreateCommandQueue(ctx->ocf.hwctx->context,
> +                                              ctx->ocf.hwctx->device_id,
> +                                              0, &cle);
> +    if (!ctx->command_queue) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to create OpenCL "
> +               "command queue: %d.\n", cle);
> +        err = AVERROR(EIO);
> +        goto fail;
> +    }
> +
> +    ctx->kernel = clCreateKernel(ctx->ocf.program, "tonemap", &cle);
> +    if (!ctx->kernel) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to create kernel: %d.\n", cle);
> +        err = AVERROR(EIO);
> +        goto fail;
> +    }
> +
> +    ctx->util_mem = clCreateBuffer(ctx->ocf.hwctx->context,
> +                                   CL_MEM_USE_HOST_PTR |
> +                                   CL_MEM_HOST_NO_ACCESS,
> +                                   sizeof(ctx->util_buf), ctx->util_buf, &cle);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to create util buffer: %d.\n", cle);
> +        err = AVERROR(EIO);
> +        goto fail;
> +    }
> +
> +    ctx->initialised = 1;
> +    return 0;
> +
> +fail:
> +    if (ctx->util_mem)
> +        clReleaseMemObject(ctx->util_mem);
> +    if (ctx->command_queue)
> +        clReleaseCommandQueue(ctx->command_queue);
> +    if (ctx->kernel)
> +        clReleaseKernel(ctx->kernel);
> +    return err;
> +}
> +
> +static int tonemap_opencl_config_output(AVFilterLink *outlink)
> +{
> +    AVFilterContext *avctx = outlink->src;
> +    TonemapOpenCLContext *s = avctx->priv;
> +    int ret;
> +    s->ocf.output_format = s->format == AV_PIX_FMT_NONE ? AV_PIX_FMT_NV12 : s->format;
> +    ret = ff_opencl_filter_config_output(outlink);
> +    if (ret < 0)
> +        return ret;
> +
> +    return 0;
> +}
> +
> +static int launch_kernel(AVFilterContext *avctx, cl_kernel kernel,
> +                         AVFrame *output, AVFrame *input, float peak) {
> +    TonemapOpenCLContext *ctx = avctx->priv;
> +    int err = AVERROR(ENOSYS);
> +    size_t global_work[2];
> +    size_t local_work[2];
> +    cl_int cle;
> +
> +    cle = clSetKernelArg(kernel, 0, sizeof(cl_mem), &output->data[0]);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "destination image 1st plane: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    cle = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output->data[1]);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "destination image 2nd plane: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    cle = clSetKernelArg(kernel, 2, sizeof(cl_mem), &input->data[0]);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "source image 1st plane: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    cle = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input->data[1]);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "source image 2nd plane: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    cle = clSetKernelArg(kernel, 4, sizeof(cl_mem), &ctx->util_mem);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "source image 2nd plane: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    cle = clSetKernelArg(kernel, 5, sizeof(cl_float), &peak);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "peak luma: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    local_work[0]  = 16;
> +    local_work[1]  = 16;
> +    // Note the work size based on uv plane, as we process a 2x2 quad in one workitem
> +    err = ff_opencl_filter_work_size_from_image(avctx, global_work, output,
> +                                                1, 16);
> +    if (err < 0)
> +        return err;
> +
> +    cle = clEnqueueNDRangeKernel(ctx->command_queue, kernel, 2, NULL,
> +                                 global_work, local_work,
> +                                 0, NULL, NULL);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to enqueue kernel: %d.\n",
> +               cle);
> +        return AVERROR(EIO);
> +    }
> +    return 0;
> +}
> +
> +static double determine_signal_peak(AVFrame *in)
> +{
> +    AVFrameSideData *sd = av_frame_get_side_data(in, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL);
> +    double peak = 0;
> +
> +    if (sd) {
> +        AVContentLightMetadata *clm = (AVContentLightMetadata *)sd->data;
> +        peak = clm->MaxCLL / REFERENCE_WHITE;
> +    }
> +
> +    sd = av_frame_get_side_data(in, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA);
> +    if (!peak && sd) {
> +        AVMasteringDisplayMetadata *metadata = (AVMasteringDisplayMetadata *)sd->data;
> +        if (metadata->has_luminance)
> +            peak = av_q2d(metadata->max_luminance) / REFERENCE_WHITE;
> +    }
> +
> +    // if not SMPTE2084, we would assume HLG
> +    if (!peak)
> +        peak = in->color_trc == AVCOL_TRC_SMPTE2084 ? 100.0f : 12.0f;
> +
> +    return peak;
> +}
> +
> +static int tonemap_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
> +{
> +    AVFilterContext    *avctx = inlink->dst;
> +    AVFilterLink     *outlink = avctx->outputs[0];
> +    TonemapOpenCLContext *ctx = avctx->priv;
> +    AVFrame *output = NULL;
> +    cl_int cle;
> +    int err;
> +    double peak = ctx->peak;
> +
> +    AVHWFramesContext *input_frames_ctx =
> +        (AVHWFramesContext*)input->hw_frames_ctx->data;
> +
> +    av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",
> +           av_get_pix_fmt_name(input->format),
> +           input->width, input->height, input->pts);
> +
> +    if (!input->hw_frames_ctx)
> +        return AVERROR(EINVAL);
> +
> +    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
> +    if (!output) {
> +        err = AVERROR(ENOMEM);
> +        goto fail;
> +    }
> +
> +    err = av_frame_copy_props(output, input);
> +    if (err < 0)
> +        goto fail;
> +
> +    if (!peak)
> +        peak = determine_signal_peak(input);
> +
> +    if (ctx->trc != -1)
> +        output->color_trc = ctx->trc;
> +    if (ctx->primaries != -1)
> +        output->color_primaries = ctx->primaries;
> +    if (ctx->colorspace != -1)
> +        output->colorspace = ctx->colorspace;
> +    if (ctx->range != -1)
> +        output->color_range = ctx->range;
> +
> +    ctx->trc_in = input->color_trc;
> +    ctx->trc_out = output->color_trc;
> +    ctx->colorspace_in = input->colorspace;
> +    ctx->colorspace_out = output->colorspace;
> +    ctx->primaries_in = input->color_primaries;
> +    ctx->primaries_out = output->color_primaries;
> +    ctx->range_in = input->color_range;
> +    ctx->range_out = output->color_range;
> +
> +    if (!ctx->initialised) {
> +        err = tonemap_opencl_init(avctx);
> +        if (err < 0)
> +            goto fail;
> +    }
> +
> +    switch(input_frames_ctx->sw_format) {
> +    case AV_PIX_FMT_P010:
> +        err = launch_kernel(avctx, ctx->kernel, output, input, peak);
> +        if (err < 0) goto fail;
> +        break;
> +    default:
> +        av_log(ctx, AV_LOG_ERROR, "unsupported format in tonemap_opencl.\n");
> +        err = AVERROR(ENOSYS);
> +        goto fail;
> +    }
> +
> +    cle = clFinish(ctx->command_queue);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to finish command queue: %d.\n",
> +               cle);
> +        err = AVERROR(EIO);
> +        goto fail;
> +    }
> +
> +    av_frame_free(&input);
> +
> +    av_log(ctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n",
> +           av_get_pix_fmt_name(output->format),
> +           output->width, output->height, output->pts);
> +
> +    return ff_filter_frame(outlink, output);
> +
> +fail:
> +    clFinish(ctx->command_queue);
> +    av_frame_free(&input);
> +    av_frame_free(&output);
> +    return err;
> +}
> +
> +static av_cold void tonemap_opencl_uninit(AVFilterContext *avctx)
> +{
> +    TonemapOpenCLContext *ctx = avctx->priv;
> +    cl_int cle;
> +
> +    if (ctx->util_mem)
> +        clReleaseMemObject(ctx->util_mem);
> +    if (ctx->kernel) {
> +        cle = clReleaseKernel(ctx->kernel);
> +        if (cle != CL_SUCCESS)
> +            av_log(avctx, AV_LOG_ERROR, "Failed to release "
> +                   "kernel: %d.\n", cle);
> +    }
> +
> +    if (ctx->command_queue) {
> +        cle = clReleaseCommandQueue(ctx->command_queue);
> +        if (cle != CL_SUCCESS)
> +            av_log(avctx, AV_LOG_ERROR, "Failed to release "
> +                   "command queue: %d.\n", cle);
> +    }
> +
> +    ff_opencl_filter_uninit(avctx);
> +}
> +
> +#define OFFSET(x) offsetof(TonemapOpenCLContext, x)
> +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
> +static const AVOption tonemap_opencl_options[] = {
> +    { "tonemap",      "tonemap algorithm selection", OFFSET(tonemap), AV_OPT_TYPE_INT, {.i64 = TONEMAP_NONE}, TONEMAP_NONE, TONEMAP_MAX - 1, FLAGS, "tonemap" },
> +    {     "none",     0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_NONE},              0, 0, FLAGS, "tonemap" },
> +    {     "linear",   0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_LINEAR},            0, 0, FLAGS, "tonemap" },
> +    {     "gamma",    0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_GAMMA},             0, 0, FLAGS, "tonemap" },
> +    {     "clip",     0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_CLIP},              0, 0, FLAGS, "tonemap" },
> +    {     "reinhard", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_REINHARD},          0, 0, FLAGS, "tonemap" },
> +    {     "hable",    0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_HABLE},             0, 0, FLAGS, "tonemap" },
> +    {     "mobius",   0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_MOBIUS},            0, 0, FLAGS, "tonemap" },
> +    { "transfer", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "transfer" },
> +    { "t",        "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "transfer" },
> +    {     "bt709",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709},         0, 0, FLAGS, "transfer" },
> +    {     "bt2020",           0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10},     0, 0, FLAGS, "transfer" },
> +    { "matrix", "set colorspace matrix", OFFSET(colorspace), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "matrix" },
> +    { "m",      "set colorspace matrix", OFFSET(colorspace), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "matrix" },
> +    {     "bt709",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709},         0, 0, FLAGS, "matrix" },
> +    {     "bt2020",           0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_NCL},    0, 0, FLAGS, "matrix" },
> +    { "primaries", "set color primaries", OFFSET(primaries), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "primaries" },
> +    { "p",         "set color primaries", OFFSET(primaries), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "primaries" },
> +    {     "bt709",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709},         0, 0, FLAGS, "primaries" },
> +    {     "bt2020",           0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020},        0, 0, FLAGS, "primaries" },
> +    { "range",         "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "range" },
> +    { "r",             "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "range" },
> +    {     "tv",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG},         0, 0, FLAGS, "range" },
> +    {     "pc",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG},         0, 0, FLAGS, "range" },
> +    {     "limited",       0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG},         0, 0, FLAGS, "range" },
> +    {     "full",          0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG},         0, 0, FLAGS, "range" },
> +    { "format",    "output pixel format", OFFSET(format), AV_OPT_TYPE_INT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_GBRAP12LE, FLAGS, "fmt" },
> +    {     "nv12",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AV_PIX_FMT_NV12},          0, 0, FLAGS, "fmt" },
> +    {     "p010",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AV_PIX_FMT_P010},          0, 0, FLAGS, "fmt" },
> +    { "peak",      "signal peak override", OFFSET(peak), AV_OPT_TYPE_DOUBLE, {.dbl = 0}, 0, DBL_MAX, FLAGS },
> +    { "param",     "tonemap parameter",   OFFSET(param), AV_OPT_TYPE_DOUBLE, {.dbl = NAN}, DBL_MIN, DBL_MAX, FLAGS },
> +    { NULL }
> +};
> +
> +AVFILTER_DEFINE_CLASS(tonemap_opencl);
> +
> +static const AVFilterPad tonemap_opencl_inputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +        .filter_frame = &tonemap_opencl_filter_frame,
> +        .config_props = &ff_opencl_filter_config_input,
> +    },
> +    { NULL }
> +};
> +
> +static const AVFilterPad tonemap_opencl_outputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +        .config_props = &tonemap_opencl_config_output,
> +    },
> +    { NULL }
> +};
> +
> +AVFilter ff_vf_tonemap_opencl = {
> +    .name           = "tonemap_opencl",
> +    .description    = NULL_IF_CONFIG_SMALL("perform HDR to SDR conversion with tonemapping"),
> +    .priv_size      = sizeof(TonemapOpenCLContext),
> +    .priv_class     = &tonemap_opencl_class,
> +    .init           = &ff_opencl_filter_init,
> +    .uninit         = &tonemap_opencl_uninit,
> +    .query_formats  = &ff_opencl_filter_query_formats,
> +    .inputs         = tonemap_opencl_inputs,
> +    .outputs        = tonemap_opencl_outputs,
> +    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
> +};
> --
> 2.7.4
>
Mark Thompson May 22, 2018, 12:18 a.m. UTC | #2
On 21/05/18 07:50, Ruiling Song wrote:
> This filter does HDR (HDR10/HLG) to SDR conversion with tone-mapping.
> 
> An example command to use this filter with vaapi codecs:
> ffmpeg -init_hw_device vaapi=va:/dev/dri/renderD128 -init_hw_device \
> opencl=ocl@va -hwaccel vaapi -hwaccel_device va -hwaccel_output_format \
> vaapi -i INPUT -filter_hw_device ocl -filter_complex \
> '[0:v]hwmap,tonemap_opencl=t=bt2020:tonemap=linear:format=p010[x1]; \
> [x1]hwmap=derive_device=vaapi:reverse=1' -c:v hevc_vaapi -profile 2 OUTPUT
> 
> Signed-off-by: Ruiling Song <ruiling.song@intel.com>
> ---

I assume you're testing with Beignet for this sort of mapping to work?  I tried it with Beignet on Coffee Lake with 10-bit videos and it looks sensible, though it is rather hard to tell whether it is in some sense "correct".

Given a non-P010 input video, it fails with build errors when compiling the kernels:

[Parsed_tonemap_opencl_1 @ 0x55b700e51540] Failed to build program: -11.
[Parsed_tonemap_opencl_1 @ 0x55b700e51540] Build log:
/home/mrt/video/ffmpeg/opencl/libavfilter/opencl/colorspace_basic.cl:125:19: error: use of undeclared identifier 'null'; did you mean 'all'?
stringInput.cl:7:21: note: expanded from macro 'rgb_matrix'

That case should probably be caught earlier and rejected with a clear message.
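
A minimal sketch of such a check, using names from this patch (the exact hook
point is an assumption; the main thing is that it runs before
tonemap_opencl_init() tries to build the kernels):

    AVHWFramesContext *input_frames_ctx =
        (AVHWFramesContext*)input->hw_frames_ctx->data;

    if (input_frames_ctx->sw_format != AV_PIX_FMT_P010) {
        av_log(avctx, AV_LOG_ERROR, "Unsupported input format %s; only p010 "
               "is currently supported.\n",
               av_get_pix_fmt_name(input_frames_ctx->sw_format));
        return AVERROR(ENOSYS);
    }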


On Mali:

$ ./ffmpeg_g -v 55 -y -i ~/test/The\ World\ in\ HDR.mkv -init_hw_device opencl -filter_hw_device opencl0 -an -vf 'format=p010,hwupload,tonemap_opencl=t=bt2020:tonemap=linear:format=p010,hwdownload,format=p010' -c:v libx264 out.mp4
...
[tonemap_opencl @ 0x8201d7c0] Filter input: opencl, 3840x2160 (0).
[Parsed_tonemap_opencl_2 @ 0x8201d760] Failed to enqueue kernel: -5.

That's an RK3288 with a Mali T760, clinfo: <https://0x0.st/se5r.txt>, full log: <https://0x0.st/se5s.log>.

(The Rockchip hardware decoder can do H.265 Main 10, but the output format isn't P010 so it's easier to use VP9 here.)


Some more thoughts below; I haven't read through all of it carefully.

Thanks,

- Mark


>  configure                              |   1 +
>  libavfilter/Makefile                   |   2 +
>  libavfilter/allfilters.c               |   1 +
>  libavfilter/colorspace_basic.c         |  89 ++++++
>  libavfilter/colorspace_basic.h         |  40 +++
>  libavfilter/opencl/colorspace_basic.cl | 179 +++++++++++
>  libavfilter/opencl/tonemap.cl          | 258 +++++++++++++++
>  libavfilter/opencl_source.h            |   2 +
>  libavfilter/vf_tonemap_opencl.c        | 560 +++++++++++++++++++++++++++++++++
>  9 files changed, 1132 insertions(+)
>  create mode 100644 libavfilter/colorspace_basic.c
>  create mode 100644 libavfilter/colorspace_basic.h
>  create mode 100644 libavfilter/opencl/colorspace_basic.cl
>  create mode 100644 libavfilter/opencl/tonemap.cl
>  create mode 100644 libavfilter/vf_tonemap_opencl.c
> 
> ...
> diff --git a/libavfilter/opencl/tonemap.cl b/libavfilter/opencl/tonemap.cl
> new file mode 100644
> index 0000000..03cf3e2
> --- /dev/null
> +++ b/libavfilter/opencl/tonemap.cl
> @@ -0,0 +1,258 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#define REFERENCE_WHITE 100.0f
> +extern float3 lrgb2yuv(float3);
> +extern float3 yuv2lrgb(float3);
> +extern float3 lrgb2lrgb(float3);
> +extern float get_luma_src(float3);
> +extern float get_luma_dst(float3);
> +extern float3 ootf(float3);
> +extern float3 inverse_ootf(float3);
> +struct detection_result {
> +    float peak;
> +    float average;
> +};
> +
> +float hable_f(float in) {
> +    float a = 0.15f, b = 0.50f, c = 0.10f, d = 0.20f, e = 0.02f, f = 0.30f;
> +    return (in * (in * a + b * c) + d * e) / (in * (in * a + b) + d * f) - e / f;
> +}
> +
> +float direct(float s, float peak) {
> +    return s;
> +}
> +
> +float linear(float s, float peak) {
> +    return s * tone_param / peak;
> +}
> +
> +float gamma(float s, float peak) {
> +    float p = s > 0.05f ? s /peak : 0.05f / peak;
> +    float v = pow(p, 1.0f / tone_param);
> +    return s > 0.05f ? v : (s * v /0.05f);
> +}
> +
> +float clip(float s, float peak) {
> +    return clamp(s * tone_param, 0.0f, 1.0f);
> +}
> +
> +float reinhard(float s, float peak) {
> +    return s / (s + tone_param) * (peak + tone_param) / peak;
> +}
> +
> +float hable(float s, float peak) {
> +    return hable_f(s)/hable_f(peak);
> +}
> +
> +float mobius(float s, float peak) {
> +    float j = tone_param;
> +    float a, b;
> +
> +    if (s <= j)
> +        return s;
> +
> +    a = -j * j * (peak - 1.0f) / (j * j - 2.0f * j + peak);
> +    b = (j * j - 2.0f * j * peak + peak) / max(peak - 1.0f, 1e-6f);
> +
> +    return (b * b + 2.0f * b * j + j * j) / (b - a) * (s + a) / (s + b);
> +}
> +
> +// detect peak/average signal of a frame, the algorithm was ported from:
> +// libplacebo (https://github.com/haasn/libplacebo)
> +struct detection_result
> +detect_peak_avg(global uint *util_buf, __local uint *sum_wg,
> +            float signal, float peak) {
> +    global uint *avg_buf = util_buf;
> +    global uint *peak_buf = avg_buf + DETECTION_FRAMES + 1;
> +    global uint *counter_wg_p = peak_buf + DETECTION_FRAMES + 1;
> +    global uint *max_total_p = counter_wg_p + 1;
> +    global uint *avg_total_p = max_total_p + 1;
> +    global uint *frame_idx_p = avg_total_p + 1;
> +    global uint *scene_frame_num_p = frame_idx_p + 1;
> +
> +    uint frame_idx = *frame_idx_p;
> +    uint scene_frame_num = *scene_frame_num_p;
> +
> +    size_t lidx = get_local_id(0);
> +    size_t lidy = get_local_id(1);
> +    size_t lsizex = get_local_size(0);
> +    size_t lsizey = get_local_size(1);
> +    uint num_wg = get_num_groups(0) * get_num_groups(1);
> +    size_t group_idx = get_group_id(0);
> +    size_t group_idy = get_group_id(1);
> +    struct detection_result r = {peak, sdr_avg};
> +    *sum_wg = 0;

This is technically a data race - maybe set it in only the first workitem?
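
Something along those lines (untested sketch):

    // clear the workgroup sum from a single workitem; the existing
    // barrier then orders the store against the atomic_adds below
    if (lidx == 0 && lidy == 0)
        *sum_wg = 0;
    barrier(CLK_LOCAL_MEM_FENCE);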

> +    barrier(CLK_LOCAL_MEM_FENCE);
> +
> +    // update workgroup sum
> +    atomic_add(sum_wg, (uint)(signal * REFERENCE_WHITE));

I think the numbers you're adding together here sum to at most something like 16 * 16 * 100 * 1023?  Can you make sure this can't overflow and add a comment on that.
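
(Evaluating that bound: 16 * 16 * 100 * 1023 = 26,188,800, roughly 2.6e7, which
is well under UINT_MAX of roughly 4.3e9, so the per-workgroup sum by itself
should be safe; a comment to that effect would still help.)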

> +    barrier(CLK_LOCAL_MEM_FENCE);
> +
> +    // update frame peak/avg using work-group-average.
> +    if (lidx == 0 && lidy == 0) {
> +        uint avg_wg = *sum_wg / (lsizex * lsizey);
> +        atomic_max(&peak_buf[frame_idx], avg_wg);
> +        atomic_add(&avg_buf[frame_idx], avg_wg);

Similarly this one?  (width/16 * height/16 * 100 * 1023, I think, which might overflow for 8K?)

> +    }
> +
> +    if (scene_frame_num > 0) {
> +        float peak = (float)*max_total_p / (REFERENCE_WHITE * scene_frame_num);
> +        float avg = (float)*avg_total_p / (REFERENCE_WHITE * scene_frame_num);
> +        r.peak = max(1.0f, peak);
> +        r.average = max(0.25f, avg);

fmax()?  (max() is an integer function, not sure what it does to 0.25f.)

> +    }
> +
> +    if (lidx == 0 && lidy == 0 && atomic_add(counter_wg_p, 1) == num_wg - 1) {
> +        *counter_wg_p = 0;
> +        avg_buf[frame_idx] /= num_wg;
> +
> +        if (scene_threshold > 0.0f) {
> +            uint cur_max = peak_buf[frame_idx];
> +            uint cur_avg = avg_buf[frame_idx];
> +            int diff = (int)(scene_frame_num * cur_avg) - (int)*avg_total_p;
> +
> +            if (abs(diff) > scene_frame_num * scene_threshold * REFERENCE_WHITE) {
> +                for (uint i = 0; i < DETECTION_FRAMES + 1; i++)
> +                  avg_buf[i] = 0;
> +                for (uint i = 0; i < DETECTION_FRAMES + 1; i++)
> +                  peak_buf[i] = 0;
> +                *avg_total_p = *max_total_p = 0;
> +                *scene_frame_num_p = 0;
> +                avg_buf[frame_idx] = cur_avg;
> +                peak_buf[frame_idx] = cur_max;
> +            }
> +        }
> +        uint next = (frame_idx + 1) % (DETECTION_FRAMES + 1);
> +        // add current frame, subtract next frame
> +        *max_total_p += peak_buf[frame_idx] - peak_buf[next];
> +        *avg_total_p += avg_buf[frame_idx] - avg_buf[next];
> +        // reset next frame
> +        peak_buf[next] = avg_buf[next] = 0;
> +        *frame_idx_p = next;
> +        *scene_frame_num_p = min(*scene_frame_num_p + 1, (uint)DETECTION_FRAMES);
> +    }
> +    return r;
> +}
> +
> +__constant const float desat_param = 0.5f;
> +__constant const float dst_peak    = 1.0f;
> +
> +float3 map_one_pixel_rgb(float3 rgb, float peak, float average) {
> +    float sig = max(max(rgb.x, max(rgb.y, rgb.z)), 1e-6f);

More max(), also below.

> +    // de-saturate
> +    if (desat_param > 0.0f) {
> +        float luma = get_luma_dst(rgb);
> +        float base = 0.18f * dst_peak;

Magic number might want some explanation.

> +        float coeff = max(sig - base, 1e-6f) / max(sig, 1e-6f);
> +        coeff = native_powr(coeff, 10.0f / desat_param);
> +        rgb = mix(rgb, (float3)luma, (float3)coeff);
> +        sig = mix(sig, luma, coeff);
> +    }
> +
> +    float sig_old = sig;
> +    float slope = min(1.0f, sdr_avg / average);
> +    sig *= slope;
> +    peak *= slope;
> +
> +    sig = TONE_FUNC(sig, peak);
> +    rgb *= (sig/sig_old);
> +    return rgb;
> +}
> +// map from source space YUV to destination space RGB
> +float3 map_to_dst_space_from_yuv(float3 yuv) {
> +    float3 c = yuv2lrgb(yuv);
> +    c = ootf(c);
> +    c = lrgb2lrgb(c);
> +    return c;
> +}
> +
> +// convert from rgb to yuv, with possible inverse-ootf
> +float3 convert_to_yuv(float3 c) {
> +    c = inverse_ootf(c);
> +    return lrgb2yuv(c);
> +}
> +
> +__kernel void tonemap(__write_only image2d_t dst1,
> +                      __write_only image2d_t dst2,
> +                      __read_only  image2d_t src1,
> +                      __read_only  image2d_t src2,
> +#ifdef THIRD_PLANE
> +                      __write_only image2d_t dst3,
> +                      __read_only  image2d_t src3,
> +#endif
> +                      global uint *util_buf,
> +                      float peak
> +                      )
> +{
> +    __local uint sum_wg;
> +    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
> +                               CLK_FILTER_NEAREST);
> +    int xi = get_global_id(0);
> +    int yi = get_global_id(1);
> +    // each work item processes four pixels
> +    int x = 2 * xi;
> +    int y = 2 * yi;
> +
> +    float y0 = read_imagef(src1, sampler, (int2)(x,     y)).x;
> +    float y1 = read_imagef(src1, sampler, (int2)(x + 1, y)).x;
> +    float y2 = read_imagef(src1, sampler, (int2)(x,     y + 1)).x;
> +    float y3 = read_imagef(src1, sampler, (int2)(x + 1, y + 1)).x;
> +#ifdef THIRD_PLANE
> +    float u = read_imagef(src2, sampler, (int2)(xi, yi)).x;
> +    float v = read_imagef(src3, sampler, (int2)(xi, yi)).x;
> +    float2 uv = (float2)(u, v);
> +#else
> +    float2 uv = read_imagef(src2, sampler, (int2)(xi,     yi)).xy;
> +#endif
> +
> +    float3 c0 = map_to_dst_space_from_yuv((float3)(y0, uv.x, uv.y));
> +    float3 c1 = map_to_dst_space_from_yuv((float3)(y1, uv.x, uv.y));
> +    float3 c2 = map_to_dst_space_from_yuv((float3)(y2, uv.x, uv.y));
> +    float3 c3 = map_to_dst_space_from_yuv((float3)(y3, uv.x, uv.y));
> +
> +    float sig0 = max(c0.x, max(c0.y, c0.z));
> +    float sig1 = max(c1.x, max(c1.y, c1.z));
> +    float sig2 = max(c2.x, max(c2.y, c2.z));
> +    float sig3 = max(c3.x, max(c3.y, c3.z));
> +    float sig = max(sig0, max(sig1, max(sig2, sig3)));
> +
> +    struct detection_result r = detect_peak_avg(util_buf, &sum_wg, sig, peak);
> +
> +    float3 c0_old = c0, c1_old = c1, c2_old = c2;
> +    c0 = map_one_pixel_rgb(c0, r.peak, r.average);
> +    c1 = map_one_pixel_rgb(c1, r.peak, r.average);
> +    c2 = map_one_pixel_rgb(c2, r.peak, r.average);
> +    c3 = map_one_pixel_rgb(c3, r.peak, r.average);
> +
> +    float3 yuv0 = convert_to_yuv(c0);
> +    float3 yuv1 = convert_to_yuv(c1);
> +    float3 yuv2 = convert_to_yuv(c2);
> +    float3 yuv3 = convert_to_yuv(c3);
> +
> +    write_imagef(dst1, (int2)(x, y), (float4)(yuv0.x, 0.0f, 0.0f, 1.0f));
> +    write_imagef(dst1, (int2)(x+1, y), (float4)(yuv1.x, 0.0f, 0.0f, 1.0f));
> +    write_imagef(dst1, (int2)(x, y+1), (float4)(yuv2.x, 0.0f, 0.0f, 1.0f));
> +    write_imagef(dst1, (int2)(x+1, y+1), (float4)(yuv3.x, 0.0f, 0.0f, 1.0f));
> +#ifdef THIRD_PLANE
> +    write_imagef(dst2, (int2)(xi, yi), (float4)(yuv0.y, 0.0f, 0.0f, 1.0f));
> +    write_imagef(dst3, (int2)(xi, yi), (float4)(yuv0.z, 0.0f, 0.0f, 1.0f));
> +#else
> +    write_imagef(dst2, (int2)(xi, yi), (float4)(yuv0.y, yuv0.z, 0.0f, 1.0f));
> +#endif
> +}
> diff --git a/libavfilter/opencl_source.h b/libavfilter/opencl_source.h
> index 4bb9969..c5b3f37 100644
> --- a/libavfilter/opencl_source.h
> +++ b/libavfilter/opencl_source.h
> @@ -21,7 +21,9 @@
>  
>  extern const char *ff_opencl_source_avgblur;
>  extern const char *ff_opencl_source_convolution;
> +extern const char *ff_opencl_source_colorspace_basic;
>  extern const char *ff_opencl_source_overlay;
> +extern const char *ff_opencl_source_tonemap;
>  extern const char *ff_opencl_source_unsharp;
>  
>  #endif /* AVFILTER_OPENCL_SOURCE_H */
> diff --git a/libavfilter/vf_tonemap_opencl.c b/libavfilter/vf_tonemap_opencl.c
> new file mode 100644
> index 0000000..e2311e0
> --- /dev/null
> +++ b/libavfilter/vf_tonemap_opencl.c
> @@ -0,0 +1,560 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +#include <float.h>
> +
> +#include "libavutil/bprint.h"
> +#include "libavutil/common.h"
> +#include "libavutil/imgutils.h"
> +#include "libavutil/mastering_display_metadata.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/pixdesc.h"
> +
> +#include "avfilter.h"
> +#include "internal.h"
> +#include "opencl.h"
> +#include "opencl_source.h"
> +#include "video.h"
> +#include "colorspace_basic.h"
> +
> +#define DETECTION_FRAMES 63
> +#define REFERENCE_WHITE 100.0f
> +
> +enum TonemapAlgorithm {
> +    TONEMAP_NONE,
> +    TONEMAP_LINEAR,
> +    TONEMAP_GAMMA,
> +    TONEMAP_CLIP,
> +    TONEMAP_REINHARD,
> +    TONEMAP_HABLE,
> +    TONEMAP_MOBIUS,
> +    TONEMAP_MAX,
> +};
> +
> +typedef struct TonemapOpenCLContext {
> +    OpenCLFilterContext ocf;
> +
> +    enum AVColorSpace colorspace, colorspace_in, colorspace_out;
> +    enum AVColorTransferCharacteristic trc, trc_in, trc_out;
> +    enum AVColorPrimaries primaries, primaries_in, primaries_out;
> +    enum AVColorRange range, range_in, range_out;
> +
> +    enum TonemapAlgorithm tonemap;
> +    enum AVPixelFormat    format;
> +    double                peak;
> +    double                param;
> +    int                   initialised;
> +    cl_kernel             kernel;
> +    cl_command_queue      command_queue;
> +    cl_mem                util_mem;
> +    DECLARE_ALIGNED(64, int32_t, util_buf)[2 * DETECTION_FRAMES + 7];
> +} TonemapOpenCLContext;
> +
> +const char *yuv_coff[AVCOL_SPC_NB] = {
> +    [AVCOL_SPC_BT709] = "rgb2yuv_bt709",
> +    [AVCOL_SPC_BT2020_NCL] = "rgb2yuv_bt2020",
> +};
> +
> +const char *rgb_coff[AVCOL_SPC_NB] = {
> +    [AVCOL_SPC_BT709] = "yuv2rgb_bt709",
> +    [AVCOL_SPC_BT2020_NCL] = "yuv2rgb_bt2020",
> +};
> +
> +const char *linearize_funcs[AVCOL_TRC_NB] = {
> +    [AVCOL_TRC_SMPTE2084] = "eotf_st2084",
> +    [AVCOL_TRC_ARIB_STD_B67] = "inverse_oetf_hlg",
> +};
> +
> +const char *ootf_funcs[AVCOL_TRC_NB] = {
> +    [AVCOL_TRC_ARIB_STD_B67] = "ootf_hlg",
> +    [AVCOL_TRC_SMPTE2084] = "",
> +};
> +
> +const char *inverse_ootf_funcs[AVCOL_TRC_NB] = {
> +    [AVCOL_TRC_ARIB_STD_B67] = "inverse_ootf_hlg",
> +    [AVCOL_TRC_SMPTE2084] = "",
> +};
> +
> +const char *delinearize_funcs[AVCOL_TRC_NB] = {
> +    [AVCOL_TRC_BT709]     = "inverse_eotf_bt1886",
> +    [AVCOL_TRC_BT2020_10] = "inverse_eotf_bt1886",
> +};
> +
> +static const struct LumaCoefficients luma_coefficients[AVCOL_SPC_NB] = {
> +    [AVCOL_SPC_BT709]      = { 0.2126, 0.7152, 0.0722 },
> +    [AVCOL_SPC_BT2020_NCL] = { 0.2627, 0.6780, 0.0593 },
> +};
> +
> +struct ColorPrimaries primaries_table[AVCOL_PRI_NB] = {
> +    [AVCOL_PRI_BT709]  = { 0.640, 0.330, 0.300, 0.600, 0.150, 0.060 },
> +    [AVCOL_PRI_BT2020] = { 0.708, 0.292, 0.170, 0.797, 0.131, 0.046 },
> +};
> +
> +struct WhitePoint whitepoint_table[AVCOL_PRI_NB] = {
> +    [AVCOL_PRI_BT709]  = { 0.3127, 0.3290 },
> +    [AVCOL_PRI_BT2020] = { 0.3127, 0.3290 },
> +};
> +
> +const char *tonemap_func[TONEMAP_MAX] = {
> +    [TONEMAP_NONE]     = "direct",
> +    [TONEMAP_LINEAR]   = "linear",
> +    [TONEMAP_GAMMA]    = "gamma",
> +    [TONEMAP_CLIP]     = "clip",
> +    [TONEMAP_REINHARD] = "reinhard",
> +    [TONEMAP_HABLE]    = "hable",
> +    [TONEMAP_MOBIUS]   = "mobius",
> +};
> +
> +static void get_rgb2rgb_matrix(enum AVColorPrimaries in, enum AVColorPrimaries out,
> +                               double rgb2rgb[3][3]) {
> +    double rgb2xyz[3][3], xyz2rgb[3][3];
> +
> +    fill_rgb2xyz_table(&primaries_table[out], &whitepoint_table[out], rgb2xyz);
> +    invert_matrix3x3(rgb2xyz, xyz2rgb);
> +    fill_rgb2xyz_table(&primaries_table[in], &whitepoint_table[in], rgb2xyz);
> +    mul3x3(rgb2rgb, rgb2xyz, xyz2rgb);
> +}
> +
> +#define OPENCL_SOURCE_NB 3
> +// Average light level for SDR signals. This is equal to a signal level of 0.5
> +// under a typical presentation gamma of about 2.0.
> +static const float sdr_avg = 0.25f;
> +static const float scene_threshold = 0.2f;
> +
> +static int tonemap_opencl_init(AVFilterContext *avctx)
> +{
> +    TonemapOpenCLContext *ctx = avctx->priv;
> +    int rgb2rgb_passthrough = 1;
> +    double rgb2rgb[3][3];
> +    struct LumaCoefficients luma_src, luma_dst;
> +    cl_int cle;
> +    int err;
> +    AVBPrint header;
> +    const char *opencl_sources[OPENCL_SOURCE_NB];
> +
> +    av_bprint_init(&header, 1024, AV_BPRINT_SIZE_AUTOMATIC);
> +
> +    switch(ctx->tonemap) {
> +    case TONEMAP_GAMMA:
> +        if (isnan(ctx->param))
> +            ctx->param = 1.8f;
> +        break;
> +    case TONEMAP_REINHARD:
> +        if (!isnan(ctx->param))
> +            ctx->param = (1.0f - ctx->param) / ctx->param;
> +        break;
> +    case TONEMAP_MOBIUS:
> +        if (isnan(ctx->param))
> +            ctx->param = 0.3f;
> +        break;
> +    }
> +
> +    if (isnan(ctx->param))
> +        ctx->param = 1.0f;
> +
> +    av_bprintf(&header, "__constant const float tone_param = %.4ff;\n",
> +               ctx->param);
> +    av_bprintf(&header, "__constant const float sdr_avg = %.4ff;\n", sdr_avg);
> +    av_bprintf(&header, "__constant const float scene_threshold = %.4ff;\n",
> +               scene_threshold);
> +    av_bprintf(&header, "#define TONE_FUNC %s\n", tonemap_func[ctx->tonemap]);
> +    av_bprintf(&header, "#define DETECTION_FRAMES %d\n", DETECTION_FRAMES);
> +
> +    if (ctx->primaries_out != ctx->primaries_in) {
> +        get_rgb2rgb_matrix(ctx->primaries_in, ctx->primaries_out, rgb2rgb);
> +        rgb2rgb_passthrough = 0;
> +    }
> +    if (ctx->range_in == AVCOL_RANGE_JPEG)
> +        av_bprintf(&header, "#define FULL_RANGE_IN\n");
> +
> +    if (ctx->range_out == AVCOL_RANGE_JPEG)
> +        av_bprintf(&header, "#define FULL_RANGE_OUT\n");
> +
> +    if (rgb2rgb_passthrough)
> +        av_bprintf(&header, "#define RGB2RGB_PASSTHROUGH\n");
> +    else {
> +        av_bprintf(&header, "__constant float rgb2rgb[9] = {\n");
> +        av_bprintf(&header, "    %.4ff, %.4ff, %.4ff,\n",
> +                   rgb2rgb[0][0], rgb2rgb[0][1], rgb2rgb[0][2]);
> +        av_bprintf(&header, "    %.4ff, %.4ff, %.4ff,\n",
> +                   rgb2rgb[1][0], rgb2rgb[1][1], rgb2rgb[1][2]);
> +        av_bprintf(&header, "    %.4ff, %.4ff, %.4ff};\n",
> +                   rgb2rgb[2][0], rgb2rgb[2][1], rgb2rgb[2][2]);
> +    }
> +
> +    av_bprintf(&header, "#define rgb_matrix %s\n",
> +               rgb_coff[ctx->colorspace_in]);

You need to check ctx->colorspace_in before this point - if it isn't a supported value then the kernel fails to compile.  (And it can go off the end if the user builds with a higher value of AVCOL_SPC_NB.)

Similarly the other function name defines below.
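
For example, a hypothetical sketch reusing this patch's names:

    if (ctx->colorspace_in >= AVCOL_SPC_NB ||
        !rgb_coff[ctx->colorspace_in]) {
        av_log(avctx, AV_LOG_ERROR, "unsupported input colorspace %d\n",
               ctx->colorspace_in);
        err = AVERROR(EINVAL);
        goto fail;
    }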

> +    av_bprintf(&header, "#define yuv_matrix %s\n",
> +               yuv_coff[ctx->colorspace_out]);
> +
> +    luma_src = luma_coefficients[ctx->colorspace_in];
> +    luma_dst = luma_coefficients[ctx->colorspace_out];
> +    av_bprintf(&header, "constant float3 luma_src = {%.4ff, %.4ff, %.4ff};\n",
> +               luma_src.cr, luma_src.cg, luma_src.cb);
> +    av_bprintf(&header, "constant float3 luma_dst = {%.4ff, %.4ff, %.4ff};\n",
> +               luma_dst.cr, luma_dst.cg, luma_dst.cb);
> +
> +    av_bprintf(&header, "#define linearize %s\n", linearize_funcs[ctx->trc_in]);
> +    av_bprintf(&header, "#define delinearize %s\n",
> +               delinearize_funcs[ctx->trc_out]);
> +
> +    av_bprintf(&header, "#define ootf_impl %s\n", ootf_funcs[ctx->trc_in]);
> +    av_bprintf(&header, "#define inverse_ootf_impl %s\n",
> +               inverse_ootf_funcs[ctx->trc_in]);
> +
> +
> +    opencl_sources[0] = header.str;
> +    opencl_sources[1] = ff_opencl_source_tonemap;
> +    opencl_sources[2] = ff_opencl_source_colorspace_basic;
> +    err = ff_opencl_filter_load_program(avctx, opencl_sources, OPENCL_SOURCE_NB);
> +
> +    av_bprint_finalize(&header, NULL);
> +    if (err < 0)
> +        goto fail;
> +
> +    ctx->command_queue = clCreateCommandQueue(ctx->ocf.hwctx->context,
> +                                              ctx->ocf.hwctx->device_id,
> +                                              0, &cle);
> +    if (!ctx->command_queue) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to create OpenCL "
> +               "command queue: %d.\n", cle);
> +        err = AVERROR(EIO);
> +        goto fail;
> +    }
> +
> +    ctx->kernel = clCreateKernel(ctx->ocf.program, "tonemap", &cle);
> +    if (!ctx->kernel) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to create kernel: %d.\n", cle);
> +        err = AVERROR(EIO);
> +        goto fail;
> +    }
> +
> +    ctx->util_mem = clCreateBuffer(ctx->ocf.hwctx->context,
> +                                   CL_MEM_USE_HOST_PTR |
> +                                   CL_MEM_HOST_NO_ACCESS,
> +                                   sizeof(ctx->util_buf), ctx->util_buf, &cle);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to create util buffer: %d.\n", cle);
> +        err = AVERROR(EIO);
> +        goto fail;
> +    }
> +
> +    ctx->initialised = 1;
> +    return 0;
> +
> +fail:
> +    if (ctx->util_mem)
> +        clReleaseMemObject(ctx->util_mem);
> +    if (ctx->command_queue)
> +        clReleaseCommandQueue(ctx->command_queue);
> +    if (ctx->kernel)
> +        clReleaseKernel(ctx->kernel);
> +    return err;
> +}
> +
> +static int tonemap_opencl_config_output(AVFilterLink *outlink)
> +{
> +    AVFilterContext *avctx = outlink->src;
> +    TonemapOpenCLContext *s = avctx->priv;
> +    int ret;
> +    s->ocf.output_format = s->format == AV_PIX_FMT_NONE ? AV_PIX_FMT_NV12 : s->format;
> +    ret = ff_opencl_filter_config_output(outlink);
> +    if (ret < 0)
> +        return ret;
> +
> +    return 0;
> +}
> +
> +static int launch_kernel(AVFilterContext *avctx, cl_kernel kernel,
> +                         AVFrame *output, AVFrame *input, float peak) {
> +    TonemapOpenCLContext *ctx = avctx->priv;
> +    int err = AVERROR(ENOSYS);
> +    size_t global_work[2];
> +    size_t local_work[2];
> +    cl_int cle;
> +
> +    cle = clSetKernelArg(kernel, 0, sizeof(cl_mem), &output->data[0]);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "destination image 1st plane: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    cle = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output->data[1]);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "destination image 2nd plane: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    cle = clSetKernelArg(kernel, 2, sizeof(cl_mem), &input->data[0]);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "source image 1st plane: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    cle = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input->data[1]);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "source image 2nd plane: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    cle = clSetKernelArg(kernel, 4, sizeof(cl_mem), &ctx->util_mem);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "util buffer: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    cle = clSetKernelArg(kernel, 5, sizeof(cl_float), &peak);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "peak luma: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    local_work[0]  = 16;
> +    local_work[1]  = 16;
> +    // Note the work size is based on the uv plane, as we process a 2x2 quad in one workitem
> +    err = ff_opencl_filter_work_size_from_image(avctx, global_work, output,
> +                                                1, 16);
> +    if (err < 0)
> +        return err;
> +
> +    cle = clEnqueueNDRangeKernel(ctx->command_queue, kernel, 2, NULL,
> +                                 global_work, local_work,
> +                                 0, NULL, NULL);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to enqueue kernel: %d.\n",
> +               cle);
> +        return AVERROR(EIO);
> +    }
> +    return 0;
> +}
> +
> +static double determine_signal_peak(AVFrame *in)
> +{
> +    AVFrameSideData *sd = av_frame_get_side_data(in, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL);
> +    double peak = 0;
> +
> +    if (sd) {
> +        AVContentLightMetadata *clm = (AVContentLightMetadata *)sd->data;
> +        peak = clm->MaxCLL / REFERENCE_WHITE;
> +    }
> +
> +    sd = av_frame_get_side_data(in, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA);
> +    if (!peak && sd) {
> +        AVMasteringDisplayMetadata *metadata = (AVMasteringDisplayMetadata *)sd->data;
> +        if (metadata->has_luminance)
> +            peak = av_q2d(metadata->max_luminance) / REFERENCE_WHITE;
> +    }
> +
> +    // if not SMPTE2084, we would assume HLG
> +    if (!peak)
> +        peak = in->color_trc == AVCOL_TRC_SMPTE2084 ? 100.0f : 12.0f;
> +
> +    return peak;
> +}
> +
> +static int tonemap_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
> +{
> +    AVFilterContext    *avctx = inlink->dst;
> +    AVFilterLink     *outlink = avctx->outputs[0];
> +    TonemapOpenCLContext *ctx = avctx->priv;
> +    AVFrame *output = NULL;
> +    cl_int cle;
> +    int err;
> +    double peak = ctx->peak;
> +
> +    AVHWFramesContext *input_frames_ctx;
> +
> +    if (!input->hw_frames_ctx)
> +        return AVERROR(EINVAL);
> +    input_frames_ctx = (AVHWFramesContext*)input->hw_frames_ctx->data;
> +
> +    av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",
> +           av_get_pix_fmt_name(input->format),
> +           input->width, input->height, input->pts);
> +
> +    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
> +    if (!output) {
> +        err = AVERROR(ENOMEM);
> +        goto fail;
> +    }
> +
> +    err = av_frame_copy_props(output, input);
> +    if (err < 0)
> +        goto fail;
> +
> +    if (!peak)
> +        peak = determine_signal_peak(input);
> +
> +    if (ctx->trc != -1)
> +        output->color_trc = ctx->trc;
> +    if (ctx->primaries != -1)
> +        output->color_primaries = ctx->primaries;
> +    if (ctx->colorspace != -1)
> +        output->colorspace = ctx->colorspace;
> +    if (ctx->range != -1)
> +        output->color_range = ctx->range;
> +
> +    ctx->trc_in = input->color_trc;
> +    ctx->trc_out = output->color_trc;
> +    ctx->colorspace_in = input->colorspace;
> +    ctx->colorspace_out = output->colorspace;
> +    ctx->primaries_in = input->color_primaries;
> +    ctx->primaries_out = output->color_primaries;
> +    ctx->range_in = input->color_range;
> +    ctx->range_out = output->color_range;
> +
> +    if (!ctx->initialised) {
> +        err = tonemap_opencl_init(avctx);
> +        if (err < 0)
> +            goto fail;
> +    }
> +
> +    switch(input_frames_ctx->sw_format) {
> +    case AV_PIX_FMT_P010:
> +        err = launch_kernel(avctx, ctx->kernel, output, input, peak);
> +        if (err < 0) goto fail;
> +        break;
> +    default:
> +        av_log(ctx, AV_LOG_ERROR, "unsupported format in tonemap_opencl.\n");
> +        err = AVERROR(ENOSYS);
> +        goto fail;
> +    }
> +
> +    cle = clFinish(ctx->command_queue);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to finish command queue: %d.\n",
> +               cle);
> +        err = AVERROR(EIO);
> +        goto fail;
> +    }

It might be nice to add some debug output here showing what transformation was actually applied and maybe some of the persistent parameters from util_buf (they would be easier to verify as sensible).
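
For instance, something along these lines (sketch; the av_color_*_name() helpers already exist in libavutil/pixdesc.h):

    av_log(avctx, AV_LOG_DEBUG, "Tone-mapping %s/%s -> %s/%s.\n",
           av_color_transfer_name(ctx->trc_in),
           av_color_primaries_name(ctx->primaries_in),
           av_color_transfer_name(ctx->trc_out),
           av_color_primaries_name(ctx->primaries_out));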

> +
> +    av_frame_free(&input);
> +
> +    av_log(ctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n",
> +           av_get_pix_fmt_name(output->format),
> +           output->width, output->height, output->pts);
> +
> +    return ff_filter_frame(outlink, output);
> +
> +fail:
> +    clFinish(ctx->command_queue);
> +    av_frame_free(&input);
> +    av_frame_free(&output);
> +    return err;
> +}
> +
> +static av_cold void tonemap_opencl_uninit(AVFilterContext *avctx)
> +{
> +    TonemapOpenCLContext *ctx = avctx->priv;
> +    cl_int cle;
> +
> +    if (ctx->util_mem)
> +        clReleaseMemObject(ctx->util_mem);
> +    if (ctx->kernel) {
> +        cle = clReleaseKernel(ctx->kernel);
> +        if (cle != CL_SUCCESS)
> +            av_log(avctx, AV_LOG_ERROR, "Failed to release "
> +                   "kernel: %d.\n", cle);
> +    }
> +
> +    if (ctx->command_queue) {
> +        cle = clReleaseCommandQueue(ctx->command_queue);
> +        if (cle != CL_SUCCESS)
> +            av_log(avctx, AV_LOG_ERROR, "Failed to release "
> +                   "command queue: %d.\n", cle);
> +    }
> +
> +    ff_opencl_filter_uninit(avctx);
> +}
> +
> +#define OFFSET(x) offsetof(TonemapOpenCLContext, x)
> +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
> +static const AVOption tonemap_opencl_options[] = {
> +    { "tonemap",      "tonemap algorithm selection", OFFSET(tonemap), AV_OPT_TYPE_INT, {.i64 = TONEMAP_NONE}, TONEMAP_NONE, TONEMAP_MAX - 1, FLAGS, "tonemap" },
> +    {     "none",     0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_NONE},              0, 0, FLAGS, "tonemap" },
> +    {     "linear",   0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_LINEAR},            0, 0, FLAGS, "tonemap" },
> +    {     "gamma",    0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_GAMMA},             0, 0, FLAGS, "tonemap" },
> +    {     "clip",     0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_CLIP},              0, 0, FLAGS, "tonemap" },
> +    {     "reinhard", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_REINHARD},          0, 0, FLAGS, "tonemap" },
> +    {     "hable",    0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_HABLE},             0, 0, FLAGS, "tonemap" },
> +    {     "mobius",   0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_MOBIUS},            0, 0, FLAGS, "tonemap" },
> +    { "transfer", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "transfer" },
> +    { "t",        "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "transfer" },
> +    {     "bt709",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709},         0, 0, FLAGS, "transfer" },
> +    {     "bt2020",           0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10},     0, 0, FLAGS, "transfer" },
> +    { "matrix", "set colorspace matrix", OFFSET(colorspace), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "matrix" },
> +    { "m",      "set colorspace matrix", OFFSET(colorspace), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "matrix" },
> +    {     "bt709",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709},         0, 0, FLAGS, "matrix" },
> +    {     "bt2020",           0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_NCL},    0, 0, FLAGS, "matrix" },
> +    { "primaries", "set color primaries", OFFSET(primaries), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "primaries" },
> +    { "p",         "set color primaries", OFFSET(primaries), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "primaries" },
> +    {     "bt709",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709},         0, 0, FLAGS, "primaries" },
> +    {     "bt2020",           0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020},        0, 0, FLAGS, "primaries" },
> +    { "range",         "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "range" },
> +    { "r",             "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "range" },
> +    {     "tv",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG},         0, 0, FLAGS, "range" },
> +    {     "pc",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG},         0, 0, FLAGS, "range" },
> +    {     "limited",       0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG},         0, 0, FLAGS, "range" },
> +    {     "full",          0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG},         0, 0, FLAGS, "range" },
> +    { "format",    "output pixel format", OFFSET(format), AV_OPT_TYPE_INT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_GBRAP12LE, FLAGS, "fmt" },
> +    {     "nv12",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AV_PIX_FMT_NV12},          0, 0, FLAGS, "fmt" },
> +    {     "p010",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AV_PIX_FMT_P010},          0, 0, FLAGS, "fmt" },

Can you use AV_OPT_TYPE_PIXFMT?
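
i.e. something like (untested):

    { "format", "output pixel format", OFFSET(format), AV_OPT_TYPE_PIXFMT,
        {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, INT_MAX, FLAGS },

which would also make the named format constants unnecessary.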

> +    { "peak",      "signal peak override", OFFSET(peak), AV_OPT_TYPE_DOUBLE, {.dbl = 0}, 0, DBL_MAX, FLAGS },
> +    { "param",     "tonemap parameter",   OFFSET(param), AV_OPT_TYPE_DOUBLE, {.dbl = NAN}, DBL_MIN, DBL_MAX, FLAGS },
> +    { NULL }
> +};
> +
> +AVFILTER_DEFINE_CLASS(tonemap_opencl);
> +
> +static const AVFilterPad tonemap_opencl_inputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +        .filter_frame = &tonemap_opencl_filter_frame,
> +        .config_props = &ff_opencl_filter_config_input,
> +    },
> +    { NULL }
> +};
> +
> +static const AVFilterPad tonemap_opencl_outputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +        .config_props = &tonemap_opencl_config_output,
> +    },
> +    { NULL }
> +};
> +
> +AVFilter ff_vf_tonemap_opencl = {
> +    .name           = "tonemap_opencl",
> +    .description    = NULL_IF_CONFIG_SMALL("perform HDR to SDR conversion with tonemapping"),
> +    .priv_size      = sizeof(TonemapOpenCLContext),
> +    .priv_class     = &tonemap_opencl_class,
> +    .init           = &ff_opencl_filter_init,
> +    .uninit         = &tonemap_opencl_uninit,
> +    .query_formats  = &ff_opencl_filter_query_formats,
> +    .inputs         = tonemap_opencl_inputs,
> +    .outputs        = tonemap_opencl_outputs,
> +    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
> +};
>
Ruiling Song May 22, 2018, 2:11 a.m. UTC | #3
> -----Original Message-----
> From: mypopy@gmail.com [mailto:mypopy@gmail.com]
> Sent: Monday, May 21, 2018 3:23 PM
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Cc: sw@jkqxz.net; ffmpeg@haasn.xyz; Song, Ruiling <ruiling.song@intel.com>
> Subject: Re: [FFmpeg-devel] [PATCH] lavfi: add opencl tonemap filter.
>

> 2018-05-21 14:50 GMT+08:00 Ruiling Song <ruiling.song@intel.com>:
> > This filter does HDR(HDR10/HLG) to SDR conversion with tone-mapping.
> > [...]
> >
> > diff --git a/libavfilter/opencl/colorspace_basic.cl
> > b/libavfilter/opencl/colorspace_basic.cl
> > new file mode 100644
> > index 0000000..ffd98c2
> > --- /dev/null
> > +++ b/libavfilter/opencl/colorspace_basic.cl
> > @@ -0,0 +1,179 @@
> > [...]

> > +#define ST2084_MAX_LUMINANCE 10000.0f
> > +#define REFERENCE_WHITE 100.0f
> > +constant const float ST2084_M1 = 0.1593017578125f;
> > +constant const float ST2084_M2 = 78.84375f;
> > +constant const float ST2084_C1 = 0.8359375f;
> > +constant const float ST2084_C2 = 18.8515625f;
> > +constant const float ST2084_C3 = 18.6875f;
> > +
> > +// TODO Move these colorspace matrix to .cpp files
>
> what's .cpp files? is it porting from some cpp file?

Sorry, this is a typo. It should be '.c'. It is better to move this matrix generation into a .c file, so that it is easy to support more color spaces.
But it is not so urgent currently, as the tonemap filter only cares about bt709/bt2020 for now.

Thanks!
Ruiling

> > +__constant float yuv2rgb_bt2020[] = {
> > +    1.0f, 0.0f, 1.4746f,
> > +    1.0f, -0.16455f, -0.57135f,
> > +    1.0f, 1.8814f, 0.0f
> > +};
> > +

Niklas Haas May 22, 2018, 2:28 a.m. UTC | #4
On Tue, 22 May 2018 01:18:30 +0100, Mark Thompson <sw@jkqxz.net> wrote:
> On 21/05/18 07:50, Ruiling Song wrote:
> > This filter does HDR(HDR10/HLG) to SDR conversion with tone-mapping.
> > 
> > An example command to use this filter with vaapi codecs:
> > FFMPEG -init_hw_device vaapi=va:/dev/dri/renderD128 -init_hw_device \
> > opencl=ocl@va -hwaccel vaapi -hwaccel_device va -hwaccel_output_format \
> > vaapi -i INPUT -filter_hw_device ocl -filter_complex \
> > '[0:v]hwmap,tonemap_opencl=t=bt2020:tonemap=linear:format=p010[x1]; \
> > [x1]hwmap=derive_device=vaapi:reverse=1' -c:v hevc_vaapi -profile 2 OUTPUT
> > 
> > Signed-off-by: Ruiling Song <ruiling.song@intel.com>
> > ---
> 
> I assume you're testing with Beignet for this sort of mapping to work?  I tried it with Beignet on Coffee Lake with 10-bit videos and it looks sensible, though it is rather hard to tell whether it is in some sense "correct".

It's also rather hard to define whether it is in some sense "correct".
The methodology employed here is generally based on ITU-R
recommendations, however the ITU-R advises multiple possible ways of
doing tone-mapping. They also highlight their own curve function, which
we don't use (for performance/simplicity reasons - iirc I gave it a try
and the result was not visually dissimilar enough from the hable
function, but my memory could be wrong). There's nothing resembling an
official "standard" way to tone-map defined by the ITU-R.

This algorithm is also generally based on the results obtained from the
"official" ACES implementation of HDR->SDR tone mapping (obtainable
here: https://github.com/ampas/aces-dev), with the key difference that
we do chroma-invariant tone mapping whereas hollywood tends to use
channel-independent tone mapping. I think the latter distorts the colors
too much for my taste and generally results in a worse-looking image. The
only important bit to make chroma-invariant tone mapping work well,
however, is the need for a good desaturation algorithm. This one is
based on original research and experimentation. The desaturation
strength with a parameter of 1.0 is comparable to the one achieved by
the ACES algorithm, although I pick a lower strength by default (0.5),
because I found it too damaging for some types of sources (particularly
bright skies) as a result of the chroma-invariant nature.

In addition to the desaturation step, the other key innovation which I
cannot find mentioned in ITU-R literature is the importance of adjusting
the overall average brightness before tone mapping. I suspect the reason
this isn't considered by the ITU-R is because the ITU-R assumes that HDR
sources actually follow their standards, which in practice none seem to
do. In theory, HDR material isn't supposed to have a significantly
higher average brightness than SDR material. Apart from the addition of
isolated super-highlights, nothing should have changed about the image
appearance. In practice, HDR directors like to point their expensive
cameras at very bright objects (e.g. the sun) and blind viewers' eyes by
failing to adjust the brightness during the mastering step. Our
algorithm compensates for this by essentially "correcting" the bad
mastering in real-time. [1] Of course, the result here is not as good as
doing it ahead of time by a human, but unfortunately we don't have a say
in this matter.

As long as the implementation is correct, I'd be confident in assuming
that this produces pleasurable results for all the sources I've thrown
at it, often even exceeding in quality the "official" SDR-mapped blu-ray
versions of the same sources on the same scenes. (Partially due to the
preserved higher color gamut)

In order to ascertain whether or not the implementation is correct, you
could compare it to results obtained by latest `mpv` (might need git
master) or `libplacebo`, both of which implement the same algorithm.


[1] The algorithm I use in mpv and libplacebo does this with one frame
of latency, because I don't want to round-trip through an intermediate
buffer in my processing chain, and there's no other way to communicate
back the measured frame statistics to the rest of the kernels in
OpenGL/Vulkan land. I do this because of my realtime requirements as
well as the structure of my processing chain.

Since you are implementing an offline filter and neither of these
restrictions apply to you, I would recommend changing the implementation
to separate the peak measurement step from the tone mapping step, so
that the former completes first and then the second runs from scratch
and can use the results computed by the former in the same frame.

If you don't do this, you run the risk of failing to tone map single
frame data (e.g. screenshots), because no data about the previous frame
is available at the time.
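
Schematically, the host side might then look like this (illustrative only; the kernel names are hypothetical):

    /* pass 1: scan the frame and fill util_buf with its peak/average */
    clEnqueueNDRangeKernel(queue, detect_kernel, 2, NULL, gws, lws, 0, NULL, NULL);
    /* pass 2: tone map the same frame using the statistics just measured */
    clEnqueueNDRangeKernel(queue, tonemap_kernel, 2, NULL, gws, lws, 0, NULL, NULL);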

> > +// detect peak/average signal of a frame, the algorithm was ported from:
> > +// libplacebo (https://github.com/haasn/libplacebo)
> > +struct detection_result
> > +detect_peak_avg(global uint *util_buf, __local uint *sum_wg,
> > +            float signal, float peak) {
> > +    global uint *avg_buf = util_buf;
> > +    global uint *peak_buf = avg_buf + DETECTION_FRAMES + 1;
> > +    global uint *counter_wg_p = peak_buf + DETECTION_FRAMES + 1;
> > +    global uint *max_total_p = counter_wg_p + 1;
> > +    global uint *avg_total_p = max_total_p + 1;
> > +    global uint *frame_idx_p = avg_total_p + 1;
> > +    global uint *scene_frame_num_p = frame_idx_p + 1;
> > +
> > +    uint frame_idx = *frame_idx_p;
> > +    uint scene_frame_num = *scene_frame_num_p;
> > +
> > +    size_t lidx = get_local_id(0);
> > +    size_t lidy = get_local_id(1);
> > +    size_t lsizex = get_local_size(0);
> > +    size_t lsizey = get_local_size(1);
> > +    uint num_wg = get_num_groups(0) * get_num_groups(1);
> > +    size_t group_idx = get_group_id(0);
> > +    size_t group_idy = get_group_id(1);
> > +    struct detection_result r = {peak, sdr_avg};
> > +    *sum_wg = 0;
> 
> This is technically a data race - maybe set it in only the first workitem?

I'm not sure where the data race is here. There's a barrier immediately
below it, which ensures that all of the *sum_wg writes must complete
before progressing further, no? So even though all of the threads conflict
in their write to *sum_wg, they all write the same thing and wait for
each other before continuing.

> 
> > +    barrier(CLK_LOCAL_MEM_FENCE);
> > +
> > +    // update workgroup sum
> > +    atomic_add(sum_wg, (uint)(signal * REFERENCE_WHITE));
> 
> I think the numbers you're adding together here sum to at most something like 16 * 16 * 100 * 1023?  Can you make sure this can't overflow and add a comment on that.

It's not * 1023; the highest possible peak in practice is 100.0 (PQ's
peak brightness). So the limit per workgroup is 16 * 16 * 10000,
requiring 22 bits to not overflow on a pathological input.
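(Concretely: 16 * 16 * 10000 = 2,560,000, which lies between 2^21 and 2^22.)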

> 
> > +    barrier(CLK_LOCAL_MEM_FENCE);
> > +
> > +    // update frame peak/avg using work-group-average.
> > +    if (lidx == 0 && lidy == 0) {
> > +        uint avg_wg = *sum_wg / (lsizex * lsizey);
> > +        atomic_max(&peak_buf[frame_idx], avg_wg);
> > +        atomic_add(&avg_buf[frame_idx], avg_wg);
> 
> Similarly this one?  (width/16 * height/16 * 100 * 1023, I think, which might overflow for 8K?)

For 8K it's 8192/16 * 4320/16 * 10000, requiring 31 bits to store
without theoretical risk of overflow.
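(Concretely: 512 * 270 * 10000 = 1,382,400,000, just under 2^31 = 2,147,483,648.)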

And actually, there is a third source of overflow worth investigating,
namely the *avg_total_p variable, since this accumulates across frames.
It stores a value of 10000 * (PEAK_DETECTION_FRAMES+1). In practice,
however, this shouldn't cause any issues for typical buffer sizes.
(Needing 20 bits for a buffer size of 100).

Note: In practice, none of these considerations are that worth worrying
about, since the average illumination of a scene is generally around at
most 50, so it's more like 23 bits needed to store a typical scene
rather than the 31 worst case I calculated earlier. The only scenario in
which I could imagine a worst case like that occurring in normal content
is if some mastering engineer mistakenly implements a "fade to white" by
fading to the highest possible HDR peak, and this were to somehow
survive being reviewed by other humans who presumably have functioning
retinas that would be screaming in pain as their displays blasted 10000
cd/m² during the fade.

> > +    // de-saturate
> > +    if (desat_param > 0.0f) {
> > +        float luma = get_luma_dst(rgb);
> > +        float base = 0.18f * dst_peak;
> 
> Magic number might want some explaination.

It is derived from experimentation and visual comparisons with e.g. the
ACES algorithm. There is no theoretical basis for it.

> +float3 ootf_hlg(float3 c) {
> +    float luma = get_luma_src(c);
> +    // assume a reference display with 1000 nits peak
> +    float factor = 1000.0f / REFERENCE_WHITE * pow(luma, 0.2f) / pow(12.0f, 1.2f);
> +    return c * factor;
> +}
> +
> +float3 inverse_ootf_hlg(float3 c) {
> +    // assume a reference display with 1000 nits peak
> +    c *=  pow(12.0f, 1.2f) / (1000.0f / REFERENCE_WHITE);
> +    c /= pow(get_luma_dst(c), 0.2f / 1.2f);
> +    return c;
> +}

I would recommend parametrizing these by the peak variable. When you
tone map from HLG to HLG at a lower peak, the inverse OOTF call needs to
use the new peak. (You should also update the peak tagging in the
frame's side channel data, not sure if you do).

Ditto, for the forwards OOTF, the `peak` needs to match the value you
assume for the src sig peak down below. You have it hard-coded as 12.0
for HLG, which is the correct peak in scene-referred space, but that
doesn't necessarily need to match the display referred case, which is
what's relevant for tone mapping. If you tune the OOTF for a 1000 nits
peak display, the source peak after applying the OOTF would be 10.0, not
12.0. Alternatively, you could tune the OOTF for 1200 nits instead.
(This is what libplacebo does, although I think not intentionally. I'll
change it to use 1000 nits as well.)
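
For illustration, a parametrized variant might look like this (sketch only; display_peak in units of REFERENCE_WHITE, i.e. 10.0 for a 1000-nit display):

    float3 ootf_hlg(float3 c, float display_peak) {
        float luma = get_luma_src(c);
        float factor = display_peak * pow(luma, 0.2f) / pow(12.0f, 1.2f);
        return c * factor;
    }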
Ruiling Song May 22, 2018, 8:48 a.m. UTC | #5
> -----Original Message-----
> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of
> Mark Thompson
> Sent: Tuesday, May 22, 2018 8:19 AM
> To: ffmpeg-devel@ffmpeg.org
> Subject: Re: [FFmpeg-devel] [PATCH] lavfi: add opencl tonemap filter.
>

> On 21/05/18 07:50, Ruiling Song wrote:
> > This filter does HDR(HDR10/HLG) to SDR conversion with tone-mapping.
> > [...]

>
> I assume you're testing with Beignet for this sort of mapping to work?  I tried it
> with Beignet on Coffee Lake with 10-bit videos and it looks sensible, though it is
> rather hard to tell whether it is in some sense "correct".

>
> Given a non-P010 input video it fails with build errors when compiling the kernels:
>
> [Parsed_tonemap_opencl_1 @ 0x55b700e51540] Failed to build program: -11.
> [Parsed_tonemap_opencl_1 @ 0x55b700e51540] Build log:
> /home/mrt/video/ffmpeg/opencl/libavfilter/opencl/colorspace_basic.cl:125:19:
> error: use of undeclared identifier 'null'; did you mean 'all'?
> stringInput.cl:7:21: note: expanded from macro 'rgb_matrix'
>
> That case should probably be caught earlier and rejected with a clear message.

Will fix it.

>
> On Mali:
>
> $ ./ffmpeg_g -v 55 -y -i ~/test/The\ World\ in\ HDR.mkv -init_hw_device opencl
> -filter_hw_device opencl0 -an -vf
> 'format=p010,hwupload,tonemap_opencl=t=bt2020:tonemap=linear:format=p010,hwdownload,format=p010' \
> -c:v libx264 out.mp4
> ...
> [tonemap_opencl @ 0x8201d7c0] Filter input: opencl, 3840x2160 (0).
> [Parsed_tonemap_opencl_2 @ 0x8201d760] Failed to enqueue kernel: -5.

The error seems to map to the OpenCL error CL_OUT_OF_RESOURCES. I don't have any idea yet.
Maybe some limitation in the driver that we haven't queried?

> That's an RK3288 with a Mali T760, clinfo: <https://0x0.st/se5r.txt>, full log:
> <https://0x0.st/se5s.log>.
>
> (The Rockchip hardware decoder can do H.265 Main 10, but the output format
> isn't P010 so it's easier to use VP9 here.)

Not p010? Then which format? Planar?
And I don't quite understand here. What is the relationship between the format and VP9?
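
One thing that might be worth checking for the -5 (just a guess; the variable names here are illustrative): whether the fixed 16x16 local size exceeds what the driver reports for this kernel, e.g.

    size_t max_wg;
    clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE,
                             sizeof(max_wg), &max_wg, NULL);

and clamping local_work accordingly.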

>
> Some more thoughts below, I haven't read through all of it carefully.

Thanks for your comments. Answers inline.

> Thanks,
>
> - Mark
>

> > [...]

> > +    struct detection_result r = {peak, sdr_avg};
> > +    *sum_wg = 0;
>
> This is technically a data race - maybe set it in only the first workitem?

When writing the same value to it, this may be fine; we should still get a correct result.
But I agree it is better to have only the first work-item do the initialization.

>
> > +    barrier(CLK_LOCAL_MEM_FENCE);
> > +
> > +    // update workgroup sum
> > +    atomic_add(sum_wg, (uint)(signal * REFERENCE_WHITE));
>
> I think the numbers you're adding together here sum to at most something like 16 * 16 * 100 * 1023?  Can you make sure this can't overflow and add a comment on that.

Niklas also pointed this out. It is 16 * 16 * 10000 at most, so no overflow here.

>
> > +    barrier(CLK_LOCAL_MEM_FENCE);
> > +
> > +    // update frame peak/avg using work-group-average.
> > +    if (lidx == 0 && lidy == 0) {
> > +        uint avg_wg = *sum_wg / (lsizex * lsizey);
> > +        atomic_max(&peak_buf[frame_idx], avg_wg);
> > +        atomic_add(&avg_buf[frame_idx], avg_wg);
>
> Similarly this one?  (width/16 * height/16 * 100 * 1023, I think, which might overflow for 8K?)
>
> > +    }
> > +
> > +    if (scene_frame_num > 0) {
> > +        float peak = (float)*max_total_p / (REFERENCE_WHITE * scene_frame_num);
> > +        float avg = (float)*avg_total_p / (REFERENCE_WHITE * scene_frame_num);
> > +        r.peak = max(1.0f, peak);
> > +        r.average = max(0.25f, avg);
>
> fmax()?  (max() is an integer function, not sure what it does to 0.25f.)

min()/max() also accept floating-point values. You can refer to chapter "6.12.4 Common Functions" in the OpenCL 1.2 spec.
>
> > [...]

> > +float3 map_one_pixel_rgb(float3 rgb, float peak, float average) {
> > +    float sig = max(max(rgb.x, max(rgb.y, rgb.z)), 1e-6f);
>
> More max(), also below.
>
> > +    // de-saturate
> > +    if (desat_param > 0.0f) {
> > +        float luma = get_luma_dst(rgb);
> > +        float base = 0.18f * dst_peak;
>
> Magic number might want some explanation.

> > +        float coeff = max(sig - base, 1e-6f) / max(sig, 1e-6f);

> > +        coeff = native_powr(coeff, 10.0f / desat_param);

> > +        rgb = mix(rgb, (float3)luma, (float3)coeff);

> > +        sig = mix(sig, luma, coeff);

> > +    }

> > +

> > +    float sig_old = sig;

> > +    float slope = min(1.0f, sdr_avg / average);

> > +    sig *= slope;

> > +    peak *= slope;

> > +

> > +    sig = TONE_FUNC(sig, peak);

> > +    rgb *= (sig/sig_old);

> > +    return rgb;

> > +}

> > +// map from source space YUV to destination space RGB

> > +float3 map_to_dst_space_from_yuv(float3 yuv) {

> > +    float3 c = yuv2lrgb(yuv);

> > +    c = ootf(c);

> > +    c = lrgb2lrgb(c);

> > +    return c;

> > +}

> > +

> > +// convert from rgb to yuv, with possible inverse-ootf

> > +float3 convert_to_yuv(float3 c) {

> > +    c = inverse_ootf(c);

> > +    return lrgb2yuv(c);

> > +}

> > +

> > +__kernel void tonemap(__write_only image2d_t dst1,

> > +                      __write_only image2d_t dst2,

> > +                      __read_only  image2d_t src1,

> > +                      __read_only  image2d_t src2,

> > +#ifdef THIRD_PLANE

> > +                      __write_only image2d_t dst3,

> > +                      __read_only  image2d_t src3,

> > +#endif

> > +                      global uint *util_buf,

> > +                      float peak

> > +                      )

> > +{

> > +    __local uint sum_wg;

> > +    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |

> > +                               CLK_FILTER_NEAREST);

> > +    int xi = get_global_id(0);

> > +    int yi = get_global_id(1);

> > +    // each work item processes four pixels

> > +    int x = 2 * xi;

> > +    int y = 2 * yi;

> > +

> > +    float y0 = read_imagef(src1, sampler, (int2)(x,     y)).x;

> > +    float y1 = read_imagef(src1, sampler, (int2)(x + 1, y)).x;

> > +    float y2 = read_imagef(src1, sampler, (int2)(x,     y + 1)).x;

> > +    float y3 = read_imagef(src1, sampler, (int2)(x + 1, y + 1)).x;

> > +#ifdef THIRD_PLANE

> > +    float u = read_imagef(src2, sampler, (int2)(xi, yi)).x;

> > +    float v = read_imagef(src3, sampler, (int2)(xi, yi)).x;

> > +    float2 uv = (float2)(u, v);

> > +#else

> > +    float2 uv = read_imagef(src2, sampler, (int2)(xi,     yi)).xy;

> > +#endif

> > +

> > +    float3 c0 = map_to_dst_space_from_yuv((float3)(y0, uv.x, uv.y));

> > +    float3 c1 = map_to_dst_space_from_yuv((float3)(y1, uv.x, uv.y));

> > +    float3 c2 = map_to_dst_space_from_yuv((float3)(y2, uv.x, uv.y));

> > +    float3 c3 = map_to_dst_space_from_yuv((float3)(y3, uv.x, uv.y));

> > +

> > +    float sig0 = max(c0.x, max(c0.y, c0.z));

> > +    float sig1 = max(c1.x, max(c1.y, c1.z));

> > +    float sig2 = max(c2.x, max(c2.y, c2.z));

> > +    float sig3 = max(c3.x, max(c3.y, c3.z));

> > +    float sig = max(sig0, max(sig1, max(sig2, sig3)));

> > +

> > +    struct detection_result r = detect_peak_avg(util_buf, &sum_wg, sig, peak);

> > +

> > +    float3 c0_old = c0, c1_old = c1, c2_old = c2;

> > +    c0 = map_one_pixel_rgb(c0, r.peak, r.average);

> > +    c1 = map_one_pixel_rgb(c1, r.peak, r.average);

> > +    c2 = map_one_pixel_rgb(c2, r.peak, r.average);

> > +    c3 = map_one_pixel_rgb(c3, r.peak, r.average);

> > +

> > +    float3 yuv0 = convert_to_yuv(c0);

> > +    float3 yuv1 = convert_to_yuv(c1);

> > +    float3 yuv2 = convert_to_yuv(c2);

> > +    float3 yuv3 = convert_to_yuv(c3);

> > +

> > +    write_imagef(dst1, (int2)(x, y), (float4)(yuv0.x, 0.0f, 0.0f, 1.0f));

> > +    write_imagef(dst1, (int2)(x+1, y), (float4)(yuv1.x, 0.0f, 0.0f, 1.0f));

> > +    write_imagef(dst1, (int2)(x, y+1), (float4)(yuv2.x, 0.0f, 0.0f, 1.0f));

> > +    write_imagef(dst1, (int2)(x+1, y+1), (float4)(yuv3.x, 0.0f, 0.0f, 1.0f));

> > +#ifdef THIRD_PLANE

> > +    write_imagef(dst2, (int2)(xi, yi), (float4)(yuv0.y, 0.0f, 0.0f, 1.0f));

> > +    write_imagef(dst3, (int2)(xi, yi), (float4)(yuv0.z, 0.0f, 0.0f, 1.0f));

> > +#else

> > +    write_imagef(dst2, (int2)(xi, yi), (float4)(yuv0.y, yuv0.z, 0.0f, 1.0f));

> > +#endif

> > +}

> > diff --git a/libavfilter/opencl_source.h b/libavfilter/opencl_source.h

> > index 4bb9969..c5b3f37 100644

> > --- a/libavfilter/opencl_source.h

> > +++ b/libavfilter/opencl_source.h

> > @@ -21,7 +21,9 @@

> >

> >  extern const char *ff_opencl_source_avgblur;

> >  extern const char *ff_opencl_source_convolution;

> > +extern const char *ff_opencl_source_colorspace_basic;

> >  extern const char *ff_opencl_source_overlay;

> > +extern const char *ff_opencl_source_tonemap;

> >  extern const char *ff_opencl_source_unsharp;

> >

> >  #endif /* AVFILTER_OPENCL_SOURCE_H */

> > diff --git a/libavfilter/vf_tonemap_opencl.c b/libavfilter/vf_tonemap_opencl.c

> > new file mode 100644

> > index 0000000..e2311e0

> > --- /dev/null

> > +++ b/libavfilter/vf_tonemap_opencl.c

> > @@ -0,0 +1,560 @@

> > +/*

> > + * This file is part of FFmpeg.

> > + *

> > + * FFmpeg is free software; you can redistribute it and/or

> > + * modify it under the terms of the GNU Lesser General Public

> > + * License as published by the Free Software Foundation; either

> > + * version 2.1 of the License, or (at your option) any later version.

> > + *

> > + * FFmpeg is distributed in the hope that it will be useful,

> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of

> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU

> > + * Lesser General Public License for more details.

> > + *

> > + * You should have received a copy of the GNU Lesser General Public

> > + * License along with FFmpeg; if not, write to the Free Software

> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301

> USA

> > + */

> > +#include <float.h>

> > +

> > +#include "libavutil/bprint.h"

> > +#include "libavutil/common.h"

> > +#include "libavutil/imgutils.h"

> > +#include "libavutil/mastering_display_metadata.h"

> > +#include "libavutil/mem.h"

> > +#include "libavutil/opt.h"

> > +#include "libavutil/pixdesc.h"

> > +

> > +#include "avfilter.h"

> > +#include "internal.h"

> > +#include "opencl.h"

> > +#include "opencl_source.h"

> > +#include "video.h"

> > +#include "colorspace_basic.h"

> > +

> > +#define DETECTION_FRAMES 63

> > +#define REFERENCE_WHITE 100.0f

> > +

> > +enum TonemapAlgorithm {

> > +    TONEMAP_NONE,

> > +    TONEMAP_LINEAR,

> > +    TONEMAP_GAMMA,

> > +    TONEMAP_CLIP,

> > +    TONEMAP_REINHARD,

> > +    TONEMAP_HABLE,

> > +    TONEMAP_MOBIUS,

> > +    TONEMAP_MAX,

> > +};

> > +

> > +typedef struct TonemapOpenCLContext {

> > +    OpenCLFilterContext ocf;

> > +

> > +    enum AVColorSpace colorspace, colorspace_in, colorspace_out;

> > +    enum AVColorTransferCharacteristic trc, trc_in, trc_out;

> > +    enum AVColorPrimaries primaries, primaries_in, primaries_out;

> > +    enum AVColorRange range, range_in, range_out;

> > +

> > +    enum TonemapAlgorithm tonemap;

> > +    enum AVPixelFormat    format;

> > +    double                peak;

> > +    double                param;

> > +    int                   initialised;

> > +    cl_kernel             kernel;

> > +    cl_command_queue      command_queue;

> > +    cl_mem                util_mem;

> > +    DECLARE_ALIGNED(64, int32_t, util_buf)[2 * DETECTION_FRAMES + 7];

> > +} TonemapOpenCLContext;

> > +

> > +const char *yuv_coff[AVCOL_SPC_NB] = {

> > +    [AVCOL_SPC_BT709] = "rgb2yuv_bt709",

> > +    [AVCOL_SPC_BT2020_NCL] = "rgb2yuv_bt2020",

> > +};

> > +

> > +const char *rgb_coff[AVCOL_SPC_NB] = {

> > +    [AVCOL_SPC_BT709] = "yuv2rgb_bt709",

> > +    [AVCOL_SPC_BT2020_NCL] = "yuv2rgb_bt2020",

> > +};

> > +

> > +const char *linearize_funcs[AVCOL_TRC_NB] = {

> > +    [AVCOL_TRC_SMPTE2084] = "eotf_st2084",

> > +    [AVCOL_TRC_ARIB_STD_B67] = "inverse_oetf_hlg",

> > +};

> > +

> > +const char *ootf_funcs[AVCOL_TRC_NB] = {

> > +    [AVCOL_TRC_ARIB_STD_B67] = "ootf_hlg",

> > +    [AVCOL_TRC_SMPTE2084] = "",

> > +};

> > +

> > +const char *inverse_ootf_funcs[AVCOL_TRC_NB] = {

> > +    [AVCOL_TRC_ARIB_STD_B67] = "inverse_ootf_hlg",

> > +    [AVCOL_TRC_SMPTE2084] = "",

> > +};

> > +

> > +const char *delinearize_funcs[AVCOL_TRC_NB] = {

> > +    [AVCOL_TRC_BT709]     = "inverse_eotf_bt1886",

> > +    [AVCOL_TRC_BT2020_10] = "inverse_eotf_bt1886",

> > +};

> > +

> > +static const struct LumaCoefficients luma_coefficients[AVCOL_SPC_NB] = {

> > +    [AVCOL_SPC_BT709]      = { 0.2126, 0.7152, 0.0722 },

> > +    [AVCOL_SPC_BT2020_NCL] = { 0.2627, 0.6780, 0.0593 },

> > +};

> > +

> > +struct ColorPrimaries primaries_table[AVCOL_PRI_NB] = {

> > +    [AVCOL_PRI_BT709]  = { 0.640, 0.330, 0.300, 0.600, 0.150, 0.060 },

> > +    [AVCOL_PRI_BT2020] = { 0.708, 0.292, 0.170, 0.797, 0.131, 0.046 },

> > +};

> > +

> > +struct WhitePoint whitepoint_table[AVCOL_PRI_NB] = {

> > +    [AVCOL_PRI_BT709]  = { 0.3127, 0.3290 },

> > +    [AVCOL_PRI_BT2020] = { 0.3127, 0.3290 },

> > +};

> > +

> > +const char *tonemap_func[TONEMAP_MAX] = {

> > +    [TONEMAP_NONE]     = "direct",

> > +    [TONEMAP_LINEAR]   = "linear",

> > +    [TONEMAP_GAMMA]    = "gamma",

> > +    [TONEMAP_CLIP]     = "clip",

> > +    [TONEMAP_REINHARD] = "reinhard",

> > +    [TONEMAP_HABLE]    = "hable",

> > +    [TONEMAP_MOBIUS]   = "mobius",

> > +};

> > +

> > +static void get_rgb2rgb_matrix(enum AVColorPrimaries in, enum AVColorPrimaries out,

> > +                               double rgb2rgb[3][3]) {

> > +    double rgb2xyz[3][3], xyz2rgb[3][3];

> > +

> > +    fill_rgb2xyz_table(&primaries_table[out], &whitepoint_table[out], rgb2xyz);

> > +    invert_matrix3x3(rgb2xyz, xyz2rgb);

> > +    fill_rgb2xyz_table(&primaries_table[in], &whitepoint_table[in], rgb2xyz);

> > +    mul3x3(rgb2rgb, rgb2xyz, xyz2rgb);

> > +}

> > +

> > +#define OPENCL_SOURCE_NB 3

> > +// Average light level for SDR signals. This is equal to a signal level of 0.5

> > +// under a typical presentation gamma of about 2.0.

> > +static const float sdr_avg = 0.25f;

> > +static const float scene_threshold = 0.2f;

> > +

> > +static int tonemap_opencl_init(AVFilterContext *avctx)

> > +{

> > +    TonemapOpenCLContext *ctx = avctx->priv;

> > +    int rgb2rgb_passthrough = 1;

> > +    double rgb2rgb[3][3];

> > +    struct LumaCoefficients luma_src, luma_dst;

> > +    cl_int cle;

> > +    int err;

> > +    AVBPrint header;

> > +    const char *opencl_sources[OPENCL_SOURCE_NB];

> > +

> > +    av_bprint_init(&header, 1024, AV_BPRINT_SIZE_AUTOMATIC);

> > +

> > +    switch(ctx->tonemap) {

> > +    case TONEMAP_GAMMA:

> > +        if (isnan(ctx->param))

> > +            ctx->param = 1.8f;

> > +        break;

> > +    case TONEMAP_REINHARD:

> > +        if (!isnan(ctx->param))

> > +            ctx->param = (1.0f - ctx->param) / ctx->param;

> > +        break;

> > +    case TONEMAP_MOBIUS:

> > +        if (isnan(ctx->param))

> > +            ctx->param = 0.3f;

> > +        break;

> > +    }

> > +

> > +    if (isnan(ctx->param))

> > +        ctx->param = 1.0f;

> > +

> > +    av_bprintf(&header, "__constant const float tone_param = %.4ff;\n",

> > +               ctx->param);

> > +    av_bprintf(&header, "__constant const float sdr_avg = %.4ff;\n", sdr_avg);

> > +    av_bprintf(&header, "__constant const float scene_threshold = %.4ff;\n",

> > +               scene_threshold);

> > +    av_bprintf(&header, "#define TONE_FUNC %s\n", tonemap_func[ctx->tonemap]);

> > +    av_bprintf(&header, "#define DETECTION_FRAMES %d\n", DETECTION_FRAMES);

> > +

> > +    if (ctx->primaries_out != ctx->primaries_in) {

> > +        get_rgb2rgb_matrix(ctx->primaries_in, ctx->primaries_out, rgb2rgb);

> > +        rgb2rgb_passthrough = 0;

> > +    }

> > +    if (ctx->range_in == AVCOL_RANGE_JPEG)

> > +        av_bprintf(&header, "#define FULL_RANGE_IN\n");

> > +

> > +    if (ctx->range_out == AVCOL_RANGE_JPEG)

> > +        av_bprintf(&header, "#define FULL_RANGE_OUT\n");

> > +

> > +    if (rgb2rgb_passthrough)

> > +        av_bprintf(&header, "#define RGB2RGB_PASSTHROUGH\n");

> > +    else {

> > +        av_bprintf(&header, "__constant float rgb2rgb[9] = {\n");

> > +        av_bprintf(&header, "    %.4ff, %.4ff, %.4ff,\n",

> > +                   rgb2rgb[0][0], rgb2rgb[0][1], rgb2rgb[0][2]);

> > +        av_bprintf(&header, "    %.4ff, %.4ff, %.4ff,\n",

> > +                   rgb2rgb[1][0], rgb2rgb[1][1], rgb2rgb[1][2]);

> > +        av_bprintf(&header, "    %.4ff, %.4ff, %.4ff};\n",

> > +                   rgb2rgb[2][0], rgb2rgb[2][1], rgb2rgb[2][2]);

> > +    }

> > +

> > +    av_bprintf(&header, "#define rgb_matrix %s\n",

> > +               rgb_coff[ctx->colorspace_in]);

> 

> You need to check ctx->colorspace_in before this point - if it isn't a supported

> value then the kernel fails to compile.  (And it can go off the end if the user

> builds with a higher value of AVCOL_SPC_NB.)

> 

> Similarly the other function name defines below.

Will add validity checks and some debug messages here. Thanks!

> 

> > +    av_bprintf(&header, "#define yuv_matrix %s\n",

> > +               yuv_coff[ctx->colorspace_out]);

> > +

> > +    luma_src = luma_coefficients[ctx->colorspace_in];

> > +    luma_dst = luma_coefficients[ctx->colorspace_out];

> > +    av_bprintf(&header, "constant float3 luma_src = {%.4ff, %.4ff, %.4ff};\n",

> > +               luma_src.cr, luma_src.cg, luma_src.cb);

> > +    av_bprintf(&header, "constant float3 luma_dst = {%.4ff, %.4ff, %.4ff};\n",

> > +               luma_dst.cr, luma_dst.cg, luma_dst.cb);

> > +

> > +    av_bprintf(&header, "#define linearize %s\n", linearize_funcs[ctx->trc_in]);

> > +    av_bprintf(&header, "#define delinearize %s\n",

> > +               delinearize_funcs[ctx->trc_out]);

> > +

> > +    av_bprintf(&header, "#define ootf_impl %s\n", ootf_funcs[ctx->trc_in]);

> > +    av_bprintf(&header, "#define inverse_ootf_impl %s\n",

> > +               inverse_ootf_funcs[ctx->trc_in]);

> > +

> > +

> > +    opencl_sources[0] = header.str;

> > +    opencl_sources[1] = ff_opencl_source_tonemap;

> > +    opencl_sources[2] = ff_opencl_source_colorspace_basic;

> > +    err = ff_opencl_filter_load_program(avctx, opencl_sources, OPENCL_SOURCE_NB);

> > +

> > +    av_bprint_finalize(&header, NULL);

> > +    if (err < 0)

> > +        goto fail;

> > +

> > +    ctx->command_queue = clCreateCommandQueue(ctx->ocf.hwctx->context,

> > +                                              ctx->ocf.hwctx->device_id,

> > +                                              0, &cle);

> > +    if (!ctx->command_queue) {

> > +        av_log(avctx, AV_LOG_ERROR, "Failed to create OpenCL "

> > +               "command queue: %d.\n", cle);

> > +        err = AVERROR(EIO);

> > +        goto fail;

> > +    }

> > +

> > +    ctx->kernel = clCreateKernel(ctx->ocf.program, "tonemap", &cle);

> > +    if (!ctx->kernel) {

> > +        av_log(avctx, AV_LOG_ERROR, "Failed to create kernel: %d.\n", cle);

> > +        err = AVERROR(EIO);

> > +        goto fail;

> > +    }

> > +

> > +    ctx->util_mem = clCreateBuffer(ctx->ocf.hwctx->context,

> > +                                   CL_MEM_USE_HOST_PTR |

> > +                                   CL_MEM_HOST_NO_ACCESS,

> > +                                   sizeof(ctx->util_buf), ctx->util_buf, &cle);

> > +    if (cle != CL_SUCCESS) {

> > +        av_log(avctx, AV_LOG_ERROR, "Failed to create util buffer: %d.\n", cle);

> > +        err = AVERROR(EIO);

> > +        goto fail;

> > +    }

> > +

> > +    ctx->initialised = 1;

> > +    return 0;

> > +

> > +fail:

> > +    if (ctx->util_mem)

> > +        clReleaseMemObject(ctx->util_mem);

> > +    if (ctx->command_queue)

> > +        clReleaseCommandQueue(ctx->command_queue);

> > +    if (ctx->kernel)

> > +        clReleaseKernel(ctx->kernel);

> > +    return err;

> > +}

> > +

> > +static int tonemap_opencl_config_output(AVFilterLink *outlink)

> > +{

> > +    AVFilterContext *avctx = outlink->src;

> > +    TonemapOpenCLContext *s = avctx->priv;

> > +    int ret;

> > +    s->ocf.output_format = s->format == AV_PIX_FMT_NONE ? AV_PIX_FMT_NV12 : s->format;

> > +    ret = ff_opencl_filter_config_output(outlink);

> > +    if (ret < 0)

> > +        return ret;

> > +

> > +    return 0;

> > +}

> > +

> > +static int launch_kernel(AVFilterContext *avctx, cl_kernel kernel,

> > +                         AVFrame *output, AVFrame *input, float peak) {

> > +    TonemapOpenCLContext *ctx = avctx->priv;

> > +    int err = AVERROR(ENOSYS);

> > +    size_t global_work[2];

> > +    size_t local_work[2];

> > +    cl_int cle;

> > +

> > +    cle = clSetKernelArg(kernel, 0, sizeof(cl_mem), &output->data[0]);

> > +    if (cle != CL_SUCCESS) {

> > +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "

> > +               "destination image 1st plane: %d.\n", cle);

> > +        return AVERROR(EINVAL);

> > +    }

> > +

> > +    cle = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output->data[1]);

> > +    if (cle != CL_SUCCESS) {

> > +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "

> > +               "destination image 2nd plane: %d.\n", cle);

> > +        return AVERROR(EINVAL);

> > +    }

> > +

> > +    cle = clSetKernelArg(kernel, 2, sizeof(cl_mem), &input->data[0]);

> > +    if (cle != CL_SUCCESS) {

> > +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "

> > +               "source image 1st plane: %d.\n", cle);

> > +        return AVERROR(EINVAL);

> > +    }

> > +

> > +    cle = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input->data[1]);

> > +    if (cle != CL_SUCCESS) {

> > +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "

> > +               "source image 2nd plane: %d.\n", cle);

> > +        return AVERROR(EINVAL);

> > +    }

> > +

> > +    cle = clSetKernelArg(kernel, 4, sizeof(cl_mem), &ctx->util_mem);

> > +    if (cle != CL_SUCCESS) {

> > +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "

> > +               "util buffer: %d.\n", cle);

> > +        return AVERROR(EINVAL);

> > +    }

> > +

> > +    cle = clSetKernelArg(kernel, 5, sizeof(cl_float), &peak);

> > +    if (cle != CL_SUCCESS) {

> > +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "

> > +               "peak luma: %d.\n", cle);

> > +        return AVERROR(EINVAL);

> > +    }

> > +

> > +    local_work[0]  = 16;

> > +    local_work[1]  = 16;

> > +    // Note the work size is based on the uv plane, as we process a 2x2 quad in one workitem

> > +    err = ff_opencl_filter_work_size_from_image(avctx, global_work, output,

> > +                                                1, 16);

> > +    if (err < 0)

> > +        return err;

> > +

> > +    cle = clEnqueueNDRangeKernel(ctx->command_queue, kernel, 2, NULL,

> > +                                 global_work, local_work,

> > +                                 0, NULL, NULL);

> > +    if (cle != CL_SUCCESS) {

> > +        av_log(avctx, AV_LOG_ERROR, "Failed to enqueue kernel: %d.\n",

> > +               cle);

> > +        return AVERROR(EIO);

> > +    }

> > +    return 0;

> > +}

> > +

> > +static double determine_signal_peak(AVFrame *in)

> > +{

> > +    AVFrameSideData *sd = av_frame_get_side_data(in, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL);

> > +    double peak = 0;

> > +

> > +    if (sd) {

> > +        AVContentLightMetadata *clm = (AVContentLightMetadata *)sd->data;

> > +        peak = clm->MaxCLL / REFERENCE_WHITE;

> > +    }

> > +

> > +    sd = av_frame_get_side_data(in, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA);

> > +    if (!peak && sd) {

> > +        AVMasteringDisplayMetadata *metadata = (AVMasteringDisplayMetadata *)sd->data;

> > +        if (metadata->has_luminance)

> > +            peak = av_q2d(metadata->max_luminance) / REFERENCE_WHITE;

> > +    }

> > +

> > +    // if not SMPTE2084, we would assume HLG

> > +    if (!peak)

> > +        peak = in->color_trc == AVCOL_TRC_SMPTE2084 ? 100.0f : 12.0f;

> > +

> > +    return peak;

> > +}

> > +

> > +static int tonemap_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)

> > +{

> > +    AVFilterContext    *avctx = inlink->dst;

> > +    AVFilterLink     *outlink = avctx->outputs[0];

> > +    TonemapOpenCLContext *ctx = avctx->priv;

> > +    AVFrame *output = NULL;

> > +    cl_int cle;

> > +    int err;

> > +    double peak = ctx->peak;

> > +

> > +    AVHWFramesContext *input_frames_ctx =

> > +        (AVHWFramesContext*)input->hw_frames_ctx->data;

> > +

> > +    av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",

> > +           av_get_pix_fmt_name(input->format),

> > +           input->width, input->height, input->pts);

> > +

> > +    if (!input->hw_frames_ctx)

> > +        return AVERROR(EINVAL);

> > +

> > +    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);

> > +    if (!output) {

> > +        err = AVERROR(ENOMEM);

> > +        goto fail;

> > +    }

> > +

> > +    err = av_frame_copy_props(output, input);

> > +    if (err < 0)

> > +        goto fail;

> > +

> > +    if (!peak)

> > +        peak = determine_signal_peak(input);

> > +

> > +    if (ctx->trc != -1)

> > +        output->color_trc = ctx->trc;

> > +    if (ctx->primaries != -1)

> > +        output->color_primaries = ctx->primaries;

> > +    if (ctx->colorspace != -1)

> > +        output->colorspace = ctx->colorspace;

> > +    if (ctx->range != -1)

> > +        output->color_range = ctx->range;

> > +

> > +    ctx->trc_in = input->color_trc;

> > +    ctx->trc_out = output->color_trc;

> > +    ctx->colorspace_in = input->colorspace;

> > +    ctx->colorspace_out = output->colorspace;

> > +    ctx->primaries_in = input->color_primaries;

> > +    ctx->primaries_out = output->color_primaries;

> > +    ctx->range_in = input->color_range;

> > +    ctx->range_out = output->color_range;

> > +

> > +    if (!ctx->initialised) {

> > +        err = tonemap_opencl_init(avctx);

> > +        if (err < 0)

> > +            goto fail;

> > +    }

> > +

> > +    switch(input_frames_ctx->sw_format) {

> > +    case AV_PIX_FMT_P010:

> > +        err = launch_kernel(avctx, ctx->kernel, output, input, peak);

> > +        if (err < 0) goto fail;

> > +        break;

> > +    default:

> > +        av_log(ctx, AV_LOG_ERROR, "unsupported format in tonemap_opencl.\n");

> > +        err = AVERROR(ENOSYS);

> > +        goto fail;

> > +    }

> > +

> > +    cle = clFinish(ctx->command_queue);

> > +    if (cle != CL_SUCCESS) {

> > +        av_log(avctx, AV_LOG_ERROR, "Failed to finish command queue: %d.\n",

> > +               cle);

> > +        err = AVERROR(EIO);

> > +        goto fail;

> > +    }

> 

> It might be nice to add some debug output here showing the what

> transformation was actually applied and maybe some of the persistent

> parameters from util_buf (they would be easier to verify as sensible).

I am not quite sure about this. What kind of message is preferred? Any specific ideas?

> 

> > +

> > +    av_frame_free(&input);

> > +

> > +    av_log(ctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n",

> > +           av_get_pix_fmt_name(output->format),

> > +           output->width, output->height, output->pts);

> > +

> > +    return ff_filter_frame(outlink, output);

> > +

> > +fail:

> > +    clFinish(ctx->command_queue);

> > +    av_frame_free(&input);

> > +    av_frame_free(&output);

> > +    return err;

> > +}

> > +

> > +static av_cold void tonemap_opencl_uninit(AVFilterContext *avctx)

> > +{

> > +    TonemapOpenCLContext *ctx = avctx->priv;

> > +    cl_int cle;

> > +

> > +    if (ctx->util_mem)

> > +        clReleaseMemObject(ctx->util_mem);

> > +    if (ctx->kernel) {

> > +        cle = clReleaseKernel(ctx->kernel);

> > +        if (cle != CL_SUCCESS)

> > +            av_log(avctx, AV_LOG_ERROR, "Failed to release "

> > +                   "kernel: %d.\n", cle);

> > +    }

> > +

> > +    if (ctx->command_queue) {

> > +        cle = clReleaseCommandQueue(ctx->command_queue);

> > +        if (cle != CL_SUCCESS)

> > +            av_log(avctx, AV_LOG_ERROR, "Failed to release "

> > +                   "command queue: %d.\n", cle);

> > +    }

> > +

> > +    ff_opencl_filter_uninit(avctx);

> > +}

> > +

> > +#define OFFSET(x) offsetof(TonemapOpenCLContext, x)

> > +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)

> > +static const AVOption tonemap_opencl_options[] = {

> > +    { "tonemap",      "tonemap algorithm selection", OFFSET(tonemap), AV_OPT_TYPE_INT, {.i64 = TONEMAP_NONE}, TONEMAP_NONE, TONEMAP_MAX - 1, FLAGS, "tonemap" },

> > +    {     "none",     0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_NONE},     0, 0, FLAGS, "tonemap" },

> > +    {     "linear",   0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_LINEAR},   0, 0, FLAGS, "tonemap" },

> > +    {     "gamma",    0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_GAMMA},    0, 0, FLAGS, "tonemap" },

> > +    {     "clip",     0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_CLIP},     0, 0, FLAGS, "tonemap" },

> > +    {     "reinhard", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_REINHARD}, 0, 0, FLAGS, "tonemap" },

> > +    {     "hable",    0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_HABLE},    0, 0, FLAGS, "tonemap" },

> > +    {     "mobius",   0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_MOBIUS},   0, 0, FLAGS, "tonemap" },

> > +    { "transfer", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "transfer" },

> > +    { "t",        "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "transfer" },

> > +    {     "bt709",  0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709},     0, 0, FLAGS, "transfer" },

> > +    {     "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10}, 0, 0, FLAGS, "transfer" },

> > +    { "matrix", "set colorspace matrix", OFFSET(colorspace), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "matrix" },

> > +    { "m",      "set colorspace matrix", OFFSET(colorspace), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "matrix" },

> > +    {     "bt709",  0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709},      0, 0, FLAGS, "matrix" },

> > +    {     "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_NCL}, 0, 0, FLAGS, "matrix" },

> > +    { "primaries", "set color primaries", OFFSET(primaries), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "primaries" },

> > +    { "p",         "set color primaries", OFFSET(primaries), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "primaries" },

> > +    {     "bt709",  0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709},  0, 0, FLAGS, "primaries" },

> > +    {     "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020}, 0, 0, FLAGS, "primaries" },

> > +    { "range",         "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "range" },

> > +    { "r",             "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "range" },

> > +    {     "tv",      0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG}, 0, 0, FLAGS, "range" },

> > +    {     "pc",      0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG}, 0, 0, FLAGS, "range" },

> > +    {     "limited", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG}, 0, 0, FLAGS, "range" },

> > +    {     "full",    0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG}, 0, 0, FLAGS, "range" },

> > +    { "format",    "output pixel format", OFFSET(format), AV_OPT_TYPE_INT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_GBRAP12LE, FLAGS, "fmt" },

> > +    {     "nv12",   0, 0, AV_OPT_TYPE_CONST, {.i64 = AV_PIX_FMT_NV12}, 0, 0, FLAGS, "fmt" },

> > +    {     "p010",   0, 0, AV_OPT_TYPE_CONST, {.i64 = AV_PIX_FMT_P010}, 0, 0, FLAGS, "fmt" },

> 

> Can you use AV_OPT_TYPE_PIXFMT?

Sure. I will try it.

> 

> > +    { "peak",      "signal peak override", OFFSET(peak),  AV_OPT_TYPE_DOUBLE, {.dbl = 0},   0,       DBL_MAX, FLAGS },

> > +    { "param",     "tonemap parameter",    OFFSET(param), AV_OPT_TYPE_DOUBLE, {.dbl = NAN}, DBL_MIN, DBL_MAX, FLAGS },

> > +    { NULL }

> > +};

> > +

> > +AVFILTER_DEFINE_CLASS(tonemap_opencl);

> > +

> > +static const AVFilterPad tonemap_opencl_inputs[] = {

> > +    {

> > +        .name         = "default",

> > +        .type         = AVMEDIA_TYPE_VIDEO,

> > +        .filter_frame = &tonemap_opencl_filter_frame,

> > +        .config_props = &ff_opencl_filter_config_input,

> > +    },

> > +    { NULL }

> > +};

> > +

> > +static const AVFilterPad tonemap_opencl_outputs[] = {

> > +    {

> > +        .name         = "default",

> > +        .type         = AVMEDIA_TYPE_VIDEO,

> > +        .config_props = &tonemap_opencl_config_output,

> > +    },

> > +    { NULL }

> > +};

> > +

> > +AVFilter ff_vf_tonemap_opencl = {

> > +    .name           = "tonemap_opencl",

> > +    .description    = NULL_IF_CONFIG_SMALL("perform HDR to SDR conversion with tonemapping"),

> > +    .priv_size      = sizeof(TonemapOpenCLContext),

> > +    .priv_class     = &tonemap_opencl_class,

> > +    .init           = &ff_opencl_filter_init,

> > +    .uninit         = &tonemap_opencl_uninit,

> > +    .query_formats  = &ff_opencl_filter_query_formats,

> > +    .inputs         = tonemap_opencl_inputs,

> > +    .outputs        = tonemap_opencl_outputs,

> > +    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,

> > +};

> >

> _______________________________________________

> ffmpeg-devel mailing list

> ffmpeg-devel@ffmpeg.org

> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Ruiling Song May 22, 2018, 8:56 a.m. UTC | #6
> -----Original Message-----

> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of

> Niklas Haas

> Sent: Tuesday, May 22, 2018 10:28 AM

> To: ffmpeg-devel@ffmpeg.org

> Cc: Mark Thompson <sw@jkqxz.net>

> Subject: Re: [FFmpeg-devel] [PATCH] lavfi: add opencl tonemap filter.

> 

> On Tue, 22 May 2018 01:18:30 +0100, Mark Thompson <sw@jkqxz.net> wrote:

> > On 21/05/18 07:50, Ruiling Song wrote:

> > > This filter does HDR(HDR10/HLG) to SDR conversion with tone-mapping.

> > >

> > > An example command to use this filter with vaapi codecs:

> > > FFMPEG -init_hw_device vaapi=va:/dev/dri/renderD128 -init_hw_device \

> > > opencl=ocl@va -hwaccel vaapi -hwaccel_device va -hwaccel_output_format

> \

> > > vaapi -i INPUT -filter_hw_device ocl -filter_complex \

> > > '[0:v]hwmap,tonemap_opencl=t=bt2020:tonemap=linear:format=p010[x1];

> \

> > > [x1]hwmap=derive_device=vaapi:reverse=1' -c:v hevc_vaapi -profile 2

> OUTPUT

> > >

> > > Signed-off-by: Ruiling Song <ruiling.song@intel.com>

> > > ---

> >

> > I assume you're testing with Beignet for this sort of mapping to work?  I tried it

> with Beignet on Coffee Lake with 10-bit videos and it looks sensible, though it is

> rather hard to tell whether it is in some sense "correct".

> 

> It's also rather hard to define whether it is in some sense "correct".

> The methodology employed here is generally based on ITU-R

> recommendations, however the ITU-R advises multiple possible ways of

> doing tone-mapping. They also highlight their own curve function, which

> we don't use (for performance/simplicity reasons - iirc I gave it a try

> and the result was not visually dissimilar enough from the hable

> function, but my memory could be wrong). There's nothing resembling an

> official "standard" way to tone-map defined by the ITU-R.

> 

> This algorithm is also generally based on the results obtained from the

> "official" ACES implementation of HDR->SDR tone mapping (obtainable

> here: https://github.com/ampas/aces-dev), with the key difference that

> we do chroma-invariant tone mapping whereas hollywood tends to use

> channel-independent tone mapping. I think the latter distorts the colors

> too much for taste and generally results in a worse looking image. The

> only important bit to make chroma-invariant tone mapping work well,

> however, is the need for a good desaturation algorithm. This one is

> based on original research and experimentation. The desaturation

> strength with a parameter of 1.0 is comparable to the one achieved by

> the ACES algorithm, although I pick a lower strength by default (0.5),

> because I found it too damaging for some types of sources (particularly

> bright skies) as a result of the chroma-invariant nature.

> 

> In addition to the desaturation step, the other key innovation which I

> cannot find mentioned in ITU-R literature is the importance of adjusting

> the overall average brightness before tone mapping. I suspect the reason

> this isn't considered by the ITU-R is because the ITU-R assumes that HDR

> sources actually follow their standards, which in practice none seem to

> do. In theory, HDR material isn't supposed to have a significantly

> higher average brightness than SDR material. Apart from the addition of

> isolated super-highlights, nothing should have changed about the image

> appearance. In practice, HDR directors like to point their expensive

> cameras at very bright objects (e.g. the sun) and blind viewers' eyes by

> failing to adjust the brightness during the mastering step. Our

> algorithm compensates for this by essentially "correcting" the bad

> mastering in real-time. [1] Of course, the result here is not as good as

> doing it ahead of time by a human, but unfortunately we don't have a say

> in this matter.

> 

> As long as the implementation is correct, I'd be confident in assuming

> that this produces pleasurable results for all the sources I've thrown

> at it, often even exceeding in quality the "official" SDR-mapped blu-ray

> versions of the same sources on the same scenes. (Partially due to the

> preserved higher color gamut)

> 

> In order to ascertain whether or not the implementation is correct, you

> could compare it to results obtained by latest `mpv` (might need git

> master) or `libplacebo`, both of which implement the same algorithm.

> 

> 

> [1] The algorithm I use in mpv and libplacebo does this with one frame

> of latency, because I don't want to round-trip through an intermediate

> buffer in my processing chain, and there's no other way to communicate

> back the measured frame statistics to the rest of the kernels in

> OpenGL/Vulkan land. I do this because of my realtime requirements as

> well as the structure of my processing chain.

> 

> Since you are implementing an offline filter and neither of these

> restrictions apply to you, I would recommend changing the implementation

> to separate the peak measurement step from the tone mapping step, so

> that the former completes first and then the second runs from scratch

> and can use the results computed by the former in the same frame.

Yes, your idea sounds reasonable. But it would take considerable effort to restructure the code for that (it would launch two kernels, and we may need a wait between them) and to evaluate the performance.
Although we are developing an offline filter, I think performance is still very important, as well as quality.
Given that the current implementation does well for video transcoding, I would like to leave it on my TODO list. Sounds OK?
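
For the record, a rough host-side sketch of what the two-pass structure might look like (just a sketch; ctx->measure_kernel is a hypothetical second kernel built from the same source that only does the peak/average detection):

    /* pass 1: measure peak/average only (hypothetical kernel) */
    cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->measure_kernel, 2,
                                 NULL, global_work, local_work, 0, NULL, NULL);
    /* with an in-order command queue the second launch is guaranteed to
     * observe the first one's global-memory writes, so no CPU wait needed */
    cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->kernel, 2,
                                 NULL, global_work, local_work, 0, NULL, NULL);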

> 

> If you don't do this, you run the risk of failing to tone map single

> frame data (e.g. screenshots), because no data about the previous frame

> is available at the time.

> 

> > > +// detect peak/average signal of a frame, the algorithm was ported from:

> > > +// libplacebo (https://github.com/haasn/libplacebo)

> > > +struct detection_result

> > > +detect_peak_avg(global uint *util_buf, __local uint *sum_wg,

> > > +            float signal, float peak) {

> > > +    global uint *avg_buf = util_buf;

> > > +    global uint *peak_buf = avg_buf + DETECTION_FRAMES + 1;

> > > +    global uint *counter_wg_p = peak_buf + DETECTION_FRAMES + 1;

> > > +    global uint *max_total_p = counter_wg_p + 1;

> > > +    global uint *avg_total_p = max_total_p + 1;

> > > +    global uint *frame_idx_p = avg_total_p + 1;

> > > +    global uint *scene_frame_num_p = frame_idx_p + 1;

> > > +

> > > +    uint frame_idx = *frame_idx_p;

> > > +    uint scene_frame_num = *scene_frame_num_p;

> > > +

> > > +    size_t lidx = get_local_id(0);

> > > +    size_t lidy = get_local_id(1);

> > > +    size_t lsizex = get_local_size(0);

> > > +    size_t lsizey = get_local_size(1);

> > > +    uint num_wg = get_num_groups(0) * get_num_groups(1);

> > > +    size_t group_idx = get_group_id(0);

> > > +    size_t group_idy = get_group_id(1);

> > > +    struct detection_result r = {peak, sdr_avg};

> > > +    *sum_wg = 0;

> >

> > This is technically a data race - maybe set it in only the first workitem?

> 

> I'm not sure where the data race is here. There's a barrier immediately

> below it, which ensures that all of the *sum_wg writes must complete

> before progressing further, no? So even though all of the threads conflict

> in their write to *sum_wg, they all write the same thing and wait for

> each other before continuing.

> 

> >

> > > +    barrier(CLK_LOCAL_MEM_FENCE);

> > > +

> > > +    // update workgroup sum

> > > +    atomic_add(sum_wg, (uint)(signal * REFERENCE_WHITE));

> >

> > I think the numbers you're adding together here sum to at most something like

> 16 * 16 * 100 * 1023?  Can you make sure this can't overflow and add a

> comment on that.

> 

> It's not * 1023, the highest possible peak in practice is 100.0 (PQ's

> peak brightness). So the limit per workgroup is 16 * 16 * 10000,

> requiring 22 bits to not overflow on a pathological input.
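
Spelling out the arithmetic for future readers (16x16 work-group, per-item maximum of signal * REFERENCE_WHITE = 100.0 * 100 = 10000):

    16 * 16 * 10000 = 2,560,000 < 2^22 = 4,194,304

so the 32-bit uint accumulator has plenty of headroom.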

> 

> >

> > > +    barrier(CLK_LOCAL_MEM_FENCE);

> > > +

> > > +    // update frame peak/avg using work-group-average.

> > > +    if (lidx == 0 && lidy == 0) {

> > > +        uint avg_wg = *sum_wg / (lsizex * lsizey);

> > > +        atomic_max(&peak_buf[frame_idx], avg_wg);

> > > +        atomic_add(&avg_buf[frame_idx], avg_wg);

> >

> > Similarly this one?  (width/16 * height/16 * 100 * 1023, I think, which might

> overflow for 8K?)

> 

> For 8K it's 8192/16 * 4320/16 * 10000, requiring 31 bits to store

> without theoretical risk of overflow.
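
Spelling out the 8K worst case as well:

    (8192/16) * (4320/16) * 10000 = 512 * 270 * 10000
                                  = 1,382,400,000 < 2^31 = 2,147,483,648

so a 32-bit uint still holds it, though with little margin to spare.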

> 

> And actually, there is a third source of overflow worth investigating,

> namely the *avg_total_p variable, since this accumulates across frames.

> It stores a value of 10000 * (PEAK_DETECTION_FRAMES+1). In practice,

> however, this shouldn't cause any issues for typical buffer sizes.

> (Needing 20 bits for a buffer size of 100).

> 

> Note: In practice, none of these considerations are that worth worrying

> about, since the average illumination of a scene is generally around at

> most 50, so it's more like 23 bits needed to store a typical scene

> rather than the 31 worst case I calculated earlier. The only scenario in

> which I could imagine a worst case like that occurring in normal content

> is if some mastering engineer mistakenly implements a "fade to white" by

> fading to the highest possible HDR peak, and this were to somehow

> survive being reviewed by other humans who presumably have functioning

> retinas that would be screaming in pain as their displays blasted 10000

> cd/m² during the fade.

> 

> > > +    // de-saturate

> > > +    if (desat_param > 0.0f) {

> > > +        float luma = get_luma_dst(rgb);

> > > +        float base = 0.18f * dst_peak;

> >

> > Magic number might want some explanation.

> 

> It is derived from experimentation and visual comparisons with e.g. the

> ACES algorithm. There is no theoretical basis for it.

> 

> > +float3 ootf_hlg(float3 c) {

> > +    float luma = get_luma_src(c);

> > +    // assume a reference display with 1000 nits peak

> > +    float factor = 1000.0f / REFERENCE_WHITE * pow(luma, 0.2f) / pow(12.0f, 1.2f);

> > +    return c * factor;

> > +}

> > +

> > +float3 inverse_ootf_hlg(float3 c) {

> > +    // assume a reference display with 1000 nits peak

> > +    c *=  pow(12.0f, 1.2f) / (1000.0f / REFERENCE_WHITE);

> > +    c /= pow(get_luma_dst(c), 0.2f / 1.2f);

> > +    return c;

> > +}

> 

> I would recommend parametrizing these by the peak variable. When you

> tone map from HLG to HLG at a lower peak, the inverse OOTF call needs to

> use the new peak. (You should also update the peak tagging in the

> frame's side channel data, not sure if you do).

Are you talking about display-referred HLG? I didn't update the frame's side-channel data.
I am not sure when I need to update it. I thought all HLG should be scene-referred, but it seems not?
Could you tell me more about display-referred HLG?
I can't find anything about it. What metadata in HEVC indicates display-referred?
Are there any display-referred HLG video samples?

Thanks for your comment.
Ruiling
> 

> Ditto, for the forwards OOTF, the `peak` needs to match the value you

> assume for the src sig peak down below. You have it hard-coded as 12.0

> for HLG, which is the correct peak in scene-referred space, but that

> doesn't necessarily need to match the display referred case, which is

> what's relevant for tone mapping. If you tune the OOTF for a 1000 nits

> peak display, the source peak after applying the OOTF would be 10.0, not

> 12.0. Alternatively, you could tune the OOTF for 1200 nits instead.

> (This is what libplacebo does, although I think not intentionally. I'll

> change it to use 1000 nits as well.)

> _______________________________________________

> ffmpeg-devel mailing list

> ffmpeg-devel@ffmpeg.org

> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Mark Thompson May 22, 2018, 12:41 p.m. UTC | #7
On 22/05/18 09:48, Song, Ruiling wrote:
>> -----Original Message-----
>> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of
>> Mark Thompson
>> Sent: Tuesday, May 22, 2018 8:19 AM
>> To: ffmpeg-devel@ffmpeg.org
>> Subject: Re: [FFmpeg-devel] [PATCH] lavfi: add opencl tonemap filter.
>>
>> On 21/05/18 07:50, Ruiling Song wrote:
>>> This filter does HDR(HDR10/HLG) to SDR conversion with tone-mapping.
>>>
>>> An example command to use this filter with vaapi codecs:
>>> FFMPEG -init_hw_device vaapi=va:/dev/dri/renderD128 -init_hw_device \
>>> opencl=ocl@va -hwaccel vaapi -hwaccel_device va -hwaccel_output_format \
>>> vaapi -i INPUT -filter_hw_device ocl -filter_complex \
>>> '[0:v]hwmap,tonemap_opencl=t=bt2020:tonemap=linear:format=p010[x1]; \
>>> [x1]hwmap=derive_device=vaapi:reverse=1' -c:v hevc_vaapi -profile 2
>> OUTPUT
>>>
>>> Signed-off-by: Ruiling Song <ruiling.song@intel.com>
>>> ---
>>
>> ...
>>
>>
>> On Mali:
>>
>> $ ./ffmpeg_g -v 55 -y -i ~/test/The\ World\ in\ HDR.mkv -init_hw_device opencl
>> -filter_hw_device opencl0 -an -vf
>> 'format=p010,hwupload,tonemap_opencl=t=bt2020:tonemap=linear:format=p010,hwdownload,format=p010' -c:v libx264 out.mp4
>> ...
>> [tonemap_opencl @ 0x8201d7c0] Filter input: opencl, 3840x2160 (0).
>> [Parsed_tonemap_opencl_2 @ 0x8201d760] Failed to enqueue kernel: -5.
> The error seems to map to OpenCL error CL_OUT_OF_RESOURCES. I don't have any idea yet.
> Maybe some limitation in the driver that is not being queried?
> 
>>
>> That's an RK3288 with a Mali T760, clinfo: <https://0x0.st/se5r.txt>, full log:
>> <https://0x0.st/se5s.log>.
>>
>> (The Rockchip hardware decoder can do H.265 Main 10, but the output format
>> isn't P010 so it's easier to use VP9 here.)
> Not p010? Then which format? Planar?

It's two-plane like P010, but with the samples packed together to minimise the memory use - I think it uses all the bits to give you four (4 x 10 = 40 bits) luma samples (or two two-component chroma samples) in each five bytes (5 x 8 = 40 bits).  This form probably isn't usable directly by anything generic like OpenCL without more magic, though Rockchip's KMS and related processing code can handle it.

> And I don't quite understand here. What is the relationship between the format and VP9?

Oh, sorry - that's coming from the mostly-unrelated point that VP9 has much better software decode support in libavcodec.  Irrelevant, really - H.265 will also work.


>>> ...
>>> +
>>> +// detect peak/average signal of a frame, the algorithm was ported from:
>>> +// libplacebo (https://github.com/haasn/libplacebo)
>>> +struct detection_result
>>> +detect_peak_avg(global uint *util_buf, __local uint *sum_wg,
>>> +            float signal, float peak) {
>>> +    global uint *avg_buf = util_buf;
>>> +    global uint *peak_buf = avg_buf + DETECTION_FRAMES + 1;
>>> +    global uint *counter_wg_p = peak_buf + DETECTION_FRAMES + 1;
>>> +    global uint *max_total_p = counter_wg_p + 1;
>>> +    global uint *avg_total_p = max_total_p + 1;
>>> +    global uint *frame_idx_p = avg_total_p + 1;
>>> +    global uint *scene_frame_num_p = frame_idx_p + 1;
>>> +
>>> +    uint frame_idx = *frame_idx_p;
>>> +    uint scene_frame_num = *scene_frame_num_p;
>>> +
>>> +    size_t lidx = get_local_id(0);
>>> +    size_t lidy = get_local_id(1);
>>> +    size_t lsizex = get_local_size(0);
>>> +    size_t lsizey = get_local_size(1);
>>> +    uint num_wg = get_num_groups(0) * get_num_groups(1);
>>> +    size_t group_idx = get_group_id(0);
>>> +    size_t group_idy = get_group_id(1);
>>> +    struct detection_result r = {peak, sdr_avg};
>>> +    *sum_wg = 0;
>>
>> This is technically a data race - maybe set it in only the first workitem?
> When writing the same value to it, this may be fine; we should still get the correct result.
> But I agree it is better to have only the first work-item do the initialization.

C/C++ make it undefined behaviour, so even when it's benign like this (writing the same value) I would prefer to avoid it.
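
Something like this in the kernel would avoid the race entirely (sketch):

    if (get_local_id(0) == 0 && get_local_id(1) == 0)
        *sum_wg = 0;
    barrier(CLK_LOCAL_MEM_FENCE); // every work-item now sees the zeroed value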

>>> +    barrier(CLK_LOCAL_MEM_FENCE);
>>> +
>>> +    // update workgroup sum
>>> +    atomic_add(sum_wg, (uint)(signal * REFERENCE_WHITE));
>>
>> I think the numbers you're adding together here sum to at most something like
>> 16 * 16 * 100 * 1023?  Can you make sure this can't overflow and add a
>> comment on that.
> Niklas also pointed this out. It is 16 * 16 * 10000 at most, so no overflow here.
> 
>>
>>> +    barrier(CLK_LOCAL_MEM_FENCE);
>>> +
>>> +    // update frame peak/avg using work-group-average.
>>> +    if (lidx == 0 && lidy == 0) {
>>> +        uint avg_wg = *sum_wg / (lsizex * lsizey);
>>> +        atomic_max(&peak_buf[frame_idx], avg_wg);
>>> +        atomic_add(&avg_buf[frame_idx], avg_wg);
>>
>> Similarly this one?  (width/16 * height/16 * 100 * 1023, I think, which might
>> overflow for 8K?)
>>
>>> +    }
>>> +
>>> +    if (scene_frame_num > 0) {
>>> +        float peak = (float)*max_total_p / (REFERENCE_WHITE *
>> scene_frame_num);
>>> +        float avg = (float)*avg_total_p / (REFERENCE_WHITE *
>> scene_frame_num);
>>> +        r.peak = max(1.0f, peak);
>>> +        r.average = max(0.25f, avg);
>>
>> fmax()?  (max() is an integer function, not sure what it does to 0.25f.)
> min()/max() also accept floating-point values. You can refer to chapter "6.12.4 Common Functions" in the OpenCL 1.2 spec.

Huh, you're right; that's fine then.  (Still confusing - fmax() is defined in 6.12.2, then max() for integers in 6.12.3, then separately max() for floats in 6.12.4.)
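
For anyone else reading along, all three of these are valid OpenCL C; they just resolve to overloads from different spec sections (variable names illustrative only):

    float a = max(0.25f, avg);  // 6.12.4 common function, float overload
    float b = fmax(0.25f, avg); // 6.12.2 math function, same result for finite inputs
    int   n = max(1, count);    // 6.12.3 integer function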

>>> ...
>>> +static int tonemap_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
>>> +{
>>> +    AVFilterContext    *avctx = inlink->dst;
>>> +    AVFilterLink     *outlink = avctx->outputs[0];
>>> +    TonemapOpenCLContext *ctx = avctx->priv;
>>> +    AVFrame *output = NULL;
>>> +    cl_int cle;
>>> +    int err;
>>> +    double peak = ctx->peak;
>>> +
>>> +    AVHWFramesContext *input_frames_ctx =
>>> +        (AVHWFramesContext*)input->hw_frames_ctx->data;
>>> +
>>> +    av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",
>>> +           av_get_pix_fmt_name(input->format),
>>> +           input->width, input->height, input->pts);
>>> +
>>> +    if (!input->hw_frames_ctx)
>>> +        return AVERROR(EINVAL);
>>> +
>>> +    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
>>> +    if (!output) {
>>> +        err = AVERROR(ENOMEM);
>>> +        goto fail;
>>> +    }
>>> +
>>> +    err = av_frame_copy_props(output, input);
>>> +    if (err < 0)
>>> +        goto fail;

av_frame_copy_props() copies the side-data which will include the mastering/light-level information, but that's no longer valid after tonemapping?
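
(If it is indeed stale, the filter could drop it explicitly after the copy - a sketch using the existing libavutil API:)

    err = av_frame_copy_props(output, input);
    if (err < 0)
        goto fail;
    /* the HDR metadata describes the input signal, not the tone-mapped
     * output, so remove it rather than propagate something misleading */
    av_frame_remove_side_data(output, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA);
    av_frame_remove_side_data(output, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL);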

>>> +
>>> +    if (!peak)
>>> +        peak = determine_signal_peak(input);
>>> +
>>> +    if (ctx->trc != -1)
>>> +        output->color_trc = ctx->trc;
>>> +    if (ctx->primaries != -1)
>>> +        output->color_primaries = ctx->primaries;
>>> +    if (ctx->colorspace != -1)
>>> +        output->colorspace = ctx->colorspace;
>>> +    if (ctx->range != -1)
>>> +        output->color_range = ctx->range;
>>> +
>>> +    ctx->trc_in = input->color_trc;
>>> +    ctx->trc_out = output->color_trc;
>>> +    ctx->colorspace_in = input->colorspace;
>>> +    ctx->colorspace_out = output->colorspace;
>>> +    ctx->primaries_in = input->color_primaries;
>>> +    ctx->primaries_out = output->color_primaries;
>>> +    ctx->range_in = input->color_range;
>>> +    ctx->range_out = output->color_range;
>>> +
>>> +    if (!ctx->initialised) {
>>> +        err = tonemap_opencl_init(avctx);
>>> +        if (err < 0)
>>> +            goto fail;
>>> +    }
>>> +
>>> +    switch(input_frames_ctx->sw_format) {
>>> +    case AV_PIX_FMT_P010:
>>> +        err = launch_kernel(avctx, ctx->kernel, output, input, peak);
>>> +        if (err < 0) goto fail;
>>> +        break;
>>> +    default:
>>> +        av_log(ctx, AV_LOG_ERROR, "unsupported format in
>> tonemap_opencl.\n");
>>> +        err = AVERROR(ENOSYS);
>>> +        goto fail;
>>> +    }
>>> +
>>> +    cle = clFinish(ctx->command_queue);
>>> +    if (cle != CL_SUCCESS) {
>>> +        av_log(avctx, AV_LOG_ERROR, "Failed to finish command queue: %d.\n",
>>> +               cle);
>>> +        err = AVERROR(EIO);
>>> +        goto fail;
>>> +    }
>>
>> It might be nice to add some debug output here showing the what
>> transformation was actually applied and maybe some of the persistent
>> parameters from util_buf (they would be easier to verify as sensible).
> I am not quite sure about this. What kind of message is preferred? Any specific ideas?

I was thinking of the colour parameters and the peak/average values which actually got used for that frame.  It would be much easier to tell externally what is going on with that information, which would help with checking results.  (To be clear, I'm only thinking of this for debugging.)

Thanks,

- Mark
Niklas Haas May 22, 2018, 12:54 p.m. UTC | #8
On Tue, 22 May 2018 08:56:37 +0000, "Song, Ruiling" <ruiling.song@intel.com> wrote:
> Yes, your idea sounds reasonable. But it may need much effort to re-structure the code to make it (that would launch two kernels, and we may need a wait between them) and evaluate the performance.

Actually, a brute force solution to solve the missing peak problem would
be to filter the first frame twice and discard the first result. (After
that, you only need to filter each frame once, so the overall
performance characteristic is unchanged for videos)

That requires minimal code change, and it still allows it to work for
single-frame video sources. It also prevents an initial flash of the
wrong brightness level for transcoded videos.
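
In terms of this patch it could be as little as (sketch only, reusing launch_kernel() as posted; 'primed' would be a new one-shot flag in the context):

    if (!ctx->primed) {
        /* priming pass: fills the peak/average state in util_buf; its
         * image output is simply overwritten by the launch below */
        err = launch_kernel(avctx, ctx->kernel, output, input, peak);
        if (err < 0)
            goto fail;
        ctx->primed = 1;
    }
    err = launch_kernel(avctx, ctx->kernel, output, input, peak);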

> Also, performance-wise, I'm not sure how this works in OpenCL land, but
in OpenGL/Vulkan, you'd just need to emit a pipeline barrier. That
allows the kernels to synchronize without having to stall the pipeline
by doing a CPU wait. (And, in general, you'd need a pipeline barrier
even if you *are* running glFinish() afterwards - the pipeline barrier
isn't just for timing, it's also for flushing the appropriate caches. In
general, write visibility on storage buffers requires a pipeline
barrier. Are you sure this is not the case for OpenCL as well?)
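
(If explicit synchronization does turn out to be needed, the OpenCL analogue would be an event dependency between the two launches rather than a clFinish() stall - a sketch, with 'measure_kernel', 'gw' and 'lw' standing in for whatever the filter actually uses:)

    cl_event measured;
    clEnqueueNDRangeKernel(queue, measure_kernel, 2, NULL, gw, lw,
                           0, NULL, &measured);
    clEnqueueNDRangeKernel(queue, tonemap_kernel, 2, NULL, gw, lw,
                           1, &measured, NULL);
    clReleaseEvent(measured);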

> Although we are developing offline filter, I think that performance is still very important as well as quality.
> Given that the current implementation does well for video transcoding. I would leave it in my TODO list. Sounds ok?

ACK. It's not my decision, I'm just offering advice.

> Are you talking about display-referred HLG? I didn't update frame side channel data.
> I am not sure when do I need to update it. I thought all HLG should be scene-referred, seems not?
> Could you tell me more about display-referred HLG?

There's no such thing as "display-referred HLG". HLG by definition is
encoded as scene-referred, but the OOTF to convert from scene-referred
to display-referred is part of the EOTF (also by definition).

So the HLG EOTF inputs scene-referred and outputs display-referred. When
you apply the EOTF (including the OOTF) as part of your processing
chain, you're turning it into a linear light display referred signal.
The tone mapping then happens on this signal (in display light), and
then to turn it back to HLG after you're done tone-mapping you apply the
inverse OOTF + OETF, thus turning it back into scene referred light.

The HLG OOTF (and therefore the EOTF) is parametrized by the display
peak. Even though the HLG signal is stored in the range 0.0 - 12.0
(scene referred), the output range depends on how you tuned the EOTF. If
you tuned it for the 1000 cd/m^2 reference display, then an input of
12.0 will get turned into an output value of 1000 cd/m^2.

If we then tone-map this to a brightness of 500 cd/m^2, and pass it back
through the same OOTF, it would get turned into 6.0 rather than the
12.0. While this may ultimately reproduce the correct result on-screen
(assuming the end user of the video file also uses a peak of 1000 cd/m^2
to decode the file), it's a suboptimal use of the encoding range and
also not how HLG is designed to operate. (For example, it would affect
the "SDR backwards compatibility" property of HLG, which is the whole
reason for the peak-dependent encoding)

That's why the correct thing to do would be to re-encode the file using
an inverse OOTF tuned for 500 cd/m², thus taking our tone mapped value
in question back to the (scene-referred) value of 12.0, and update the
tagged peak to also read 500 cd/m². Now a spec-conforming implementation
of a video player (e.g. mpv or VLC) that plays this file would use the
same tuned EOTF to decode it back to the value of 500 cd/m², thus
ensuring it round trips correctly.
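
Concretely, the parametrization I have in mind is just (sketch; 'peak' here is the display peak in units of REFERENCE_WHITE, so 10.0 reproduces the hard-coded 1000-nit tuning in the patch):

    float3 ootf_hlg(float3 c, float peak) {
        float luma = get_luma_src(c);
        // BT.2100 system gamma of 1.2, referenced to the 12.0 signal peak
        float factor = peak * pow(luma, 0.2f) / pow(12.0f, 1.2f);
        return c * factor;
    }

    float3 inverse_ootf_hlg(float3 c, float peak) {
        c *= pow(12.0f, 1.2f) / peak;
        c /= pow(get_luma_dst(c), 0.2f / 1.2f);
        return c;
    }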

> I don't find anything about it. What metadata in HEVC indicate display-referred?
> Any display-referred HLG video sample?

As mentioned, the HLG EOTF by definition requires transforming to
display-referred space. The mastering display metadata *is* what
describes how this (definitively display-referred) space behaves. So
when decoding HLG, you use the tagged mastering metadata's peak as the
parametrization for the EOTF. (This is what e.g. mpv and VLC do)

For a better explanation of this (admittedly confusing) topic, see Annex
1 of ITU-R Recommendation BT.2100.

Here is a relevant excerpt: http://0x0.st/se7O.png
Ruiling Song May 23, 2018, 5:47 a.m. UTC | #9
> -----Original Message-----

> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of

> Niklas Haas

> Sent: Tuesday, May 22, 2018 8:54 PM

> To: Song, Ruiling <ruiling.song@intel.com>

> Cc: Mark Thompson <sw@jkqxz.net>; FFmpeg development discussions and

> patches <ffmpeg-devel@ffmpeg.org>

> Subject: Re: [FFmpeg-devel] [PATCH] lavfi: add opencl tonemap filter.

> 

> On Tue, 22 May 2018 08:56:37 +0000, "Song, Ruiling" <ruiling.song@intel.com>

> wrote:

> > Yes, your idea sounds reasonable. But it may need much effort to re-structure

> the code to make it (that would launch two kernels, and we may need a wait

> between them) and evaluate the performance.

> 

> Actually, a brute force solution to solve the missing peak problem would

> be to filter the first frame twice and discard the first result. (After

> that, you only need to filter each frame once, so the overall

> performance characteristic is unchanged for videos)

> 

> That requires minimal code change, and it still allows it to work for

> single-frame video sources. It also prevents an initial flash of the

> wrong brightness level for transcoded videos.

For single-frame video, do you mean a still image?
I am not sure whether the current OpenCL acceleration is well designed for that.
My feeling is that people mainly use OpenCL for video acceleration,
especially interop with hardware-accelerated codecs. Feel free to correct me on this.

For the very first frame, I think it is not easy to notice a flash.
Because a default peak value (100 for PQ) is used for the first frame,
we would just get the first frame a little dimmer.

> 

> Also, performance-wise, I'm not sure how this works in OpenCL land, but

> in OpenGL/Vulkan, you'd just need to emit a pipeline barrier. That

> allows the kernels to synchronize without having to stall the pipeline

> by doing a CPU wait. (And, in general, you'd need a pipeline barrier

> even if you *are* running glFinish() afterwards - the pipeline barrier

> isn't just for timing, it's also for flushing the appropriate caches. In

> general, write visibility on storage buffers requires a pipeline

> barrier. Are you sure this is not the case for OpenCL as well?)

Thinking about it again, the two OpenCL kernel launches need no wait; they are just two kernels launched from the host.
The performance cost I mentioned is that we would read the image twice, which is obviously not as efficient as reading it once.
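
For reference, a minimal host-side sketch of what I mean (kernel and variable
names here are placeholders, not the patch's): with the default in-order
command queue, the runtime serializes the two launches, so no host-side wait
or explicit barrier is needed between them.

    #include <CL/cl.h>

    static cl_int run_two_pass(cl_command_queue queue,
                               cl_kernel analyze_kernel, cl_kernel tonemap_kernel,
                               const size_t global_work[2], const size_t local_work[2])
    {
        /* pass 1: gather per-frame peak/average statistics */
        cl_int cle = clEnqueueNDRangeKernel(queue, analyze_kernel, 2, NULL,
                                            global_work, local_work, 0, NULL, NULL);
        if (cle != CL_SUCCESS)
            return cle;
        /* pass 2: tone map using those statistics; the in-order queue
         * guarantees this runs after pass 1 completes */
        return clEnqueueNDRangeKernel(queue, tonemap_kernel, 2, NULL,
                                      global_work, local_work, 0, NULL, NULL);
    }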
> 

> > Although we are developing offline filter, I think that performance is still very

> important as well as quality.

> > Given that the current implementation does well for video transcoding. I

> would leave it in my TODO list. Sounds ok?

> 

> ACK. It's not my decision, I'm just offering advice.

> 

> > Are you talking about display-referred HLG? I didn't update frame side channel

> data.

> > I am not sure when do I need to update it. I thought all HLG should be scene-

> referred, seems not?

> > Could you tell me more about display-referred HLG?

> 

> There's no such thing as "display-referred HLG". HLG by definition is

> encoded as scene-referred, but the OOTF to convert from scene-referred

> to display-referred is part of the EOTF (also by definition).

> 

> So the HLG EOTF inputs scene-referred and outputs display-referred. When

> you apply the EOTF (including the OOTF) as part of your processing

> chain, you're turning it into a linear light display referred signal.

> The tone mapping then happens on this signal (in display light), and

> then to turn it back to HLG after you're done tone-mapping you apply the

> inverse OOTF + OETF, thus turning it back into scene referred light.

> 

> The HLG OOTF (and therefore the EOTF) is parametrized by the display

> peak. Even though the HLG signal is stored in the range 0.0 - 12.0

> (scene referred), the output range depends on how you tuned the EOTF. If

> you tuned it for the 1000 cd/m^2 reference display, then an input of

> 12.0 will get turned into an output value of 1000 cd/m^2.

> 

> If we then tone-map this to a brightness of 500 cd/m^2, and pass it back

> through the same OOTF, it would get turned into 6.0 rather than the

> 12.0. While this may ultimately reproduce the correct result on-screen

> (assuming the end user of the video file also uses a peak of 1000 cd/m^2

> to decode the file), it's a suboptimal use of the encoding range and

> also not how HLG is designed to operate. (For example, it would affect

> the "SDR backwards compatibility" property of HLG, which is the whole

> reason for the peak-dependent encoding)

> 

> That's why the correct thing to do would be to re-encode the file using

> an inverse OOTF tuned for 500 cd/m², thus taking our tone mapped value

> in question back to the (scene-referred) value of 12.0, and update the

> tagged peak to also read 500 cd/m². Now a spec-conforming implementation

> of a video player (e.g. mpv or VLC) that plays this file would use the

> same tuned EOTF to decode it back to the value of 500 cd/m², thus

> ensuring it round trips correctly.

> 

> > I don't find anything about it. What metadata in HEVC indicate display-referred?

> > Any display-referred HLG video sample?

> 

> As mentioned, the HLG EOTF by definition requires transforming to

> display-referred space. The mastering display metadata *is* what

> describes how this (definitively display-referred) space behaves. So

> when decoding HLG, you use the tagged mastering metadata's peak as the

> parametrization for the EOTF. (This is what e.g. mpv and VLC do)

> 

> For a better explanation of this (admittedly confusing) topic, see Annex

> 1 of ITU-R Recommendation BT.2100.

Excellent explanation. I think I get your idea. I will refine the code per your suggestion.
But I still have some questions: will people/tools tend to fill in the mastering information for HLG video?
I currently see no document that recommends filling in the mastering display metadata for HLG.
I only have one HLG sample, downloaded from 4kmedia.org, and it seems to have no mastering metadata.
Do you have any more HLG videos that show it will often be filled in?
My concern here is whether all video players will correctly parse the mastering display metadata to decode HLG, or just skip it because most HLG video has no metadata.
Since what I do now is tone mapping from HDR to SDR, do you think it is meaningful to add the metadata for SDR video?
And it looks like using a peak of 100 in inverse_ootf() when tone-mapping to SDR is just OK?

Thanks again for your kind advice and suggestions!

Ruiling
> 

> Here is a relevant excerpt: http://0x0.st/se7O.png

Niklas Haas May 23, 2018, 11:26 a.m. UTC | #10
> Excellent explanation. I think I get your idea. I will refine the code per your suggestion.
> But I still have some questions: will people/tools tend to fill in the mastering information for HLG video?
> I currently see no document that recommends filling in the mastering display metadata for HLG.
> I only have one HLG sample, downloaded from 4kmedia.org, and it seems to have no mastering metadata.
> Do you have any more HLG videos that show it will often be filled in?
> My concern here is whether all video players will correctly parse the mastering display metadata to decode HLG, or just skip it because most HLG video has no metadata.

I think there's probably going to be three ways to approach this
situation. Part of the problem is surely the fact that HLG is sort of
designed to be "implicitly" tone-mapped. That is, the way the HLG
standard is written, you'd just always encode things so that 12.0 is the
brightest peak brightness, and a user with a 500 cd/m² peak TV would
just apply the HLG OOTF tuned for 500 cd/m² on the original signal as
received from the (e.g. blu-ray) source. Sure, the mastering engineer
may have used a 1500 cd/m² screen to master it, but since the HLG
OOTF-OOTF round-trip essentially constitutes a simple form of
tone-mapping, the overall result on-screen will look more or less
reasonable. (Certainly more reasonable than e.g. PQ)

So surely there's the camp of people that believe HLG doesn't need
mastering metadata and will therefore not include it, because the end
result without metadata looks more or less good enough. However, I
disagree with this result. First of all, it prevents color-accurate
round-trips. The HLG OOTF is inherently color-distorting, so in a
color-managed workflow with calibrated devices, this methodology will
not be sufficient to ensure perceptually accurate reproduction. The
second reason is that as I said, the HLG OOTF-OOTF interaction
essentially constitutes a simple form of tone-mapping; but we can do
significantly better. I believe our tone mapping algorithm produces a
far better result (visually) than applying the HLG OOTF as-is,
especially when going to an SDR display. (If you're using mpv, you can
test this by playing a HLG source once with --vf=format:peak=10 and once
with --vf=format:peak=1. In the latter case, the only tone mapping being
done is the implicit HLG tone mapping). Not only are HLG sources I've
found inconsistently encoded, but also I find that the inherent HLG
tone-mapping tends to over-saturate the signal (similar to the result we
get if the desaturation strength is 0.0) while also boosting the gamma.

So if we subscribe to the idea that we need metadata to do
color-accurate tone mapping and reproduction, then the question becomes:
what do we do for un-tagged sources? The obvious approach is to assume a
(display-referred) signal peak of 10.0 (thus corresponding to a display
peak of 1000 cd/m², i.e. the HLG reference device). But I think if I was
to make a HLG release of my own, I would definitely try and include the
most accurate tagging possible. For example, if we have a clip available
in both PQ and HLG, I would use the PQ version's mastering metadata for
HLG as well.

Finally, to put the nail in the coffin of the idea that HLG doesn't need
metadata, we should realize that the mastering metadata isn't just there
to help you tone map the brightness levels, it also includes the
display's gamut capabilities - and for a good reason. When doing
desaturation in order to fit the BT.2020 signal into a (typically far
less than BT.2020) display response, knowing the gamut limitations of
the signal can similarly help users do a far better job than having to
assume the worst case scenario - for much the same reason that knowing
the signal's actual peak brightness can help users do a far better job
tone-mapping than having to assume a worst-case peak of 10,000 cd/m².
Indeed, in the best case scenario (your own display's gamut and
brightness capabilities match or exceed the mastering display's), both
of these can just be no-ops.

So if mastering metadata is beneficial at all, then we should also agree
that mastering metadata is beneficial to BT.2020 + HLG sources, simply
for the gamut data alone. The fact that HLG is ill-defined without
knowing the mastering display's brightness is just icing on the cake at
this point.

> Since what I do now is tone mapping from HDR to SDR, do you think it is meaningful to add the metadata for SDR video?

The mastering metadata is still useful for the gamut information as
explained. Since you're (most likely) encoding a BT.2020 signal that
doesn't use the full gamut range of BT.2020, even for SDR curves it can
be a good idea to preserve it.

> And it looks like using a peak of 100 in inverse_ootf() when tone-mapping to SDR is just OK?

Sure. That won't blow up, but using HLG to store an SDR signal is sort
of wasteful/pointless. Might as well use an actual SDR curve and skip
the inverse_ootf step altogether.

> 
> Thanks again for your kind advice and suggestions!
Niklas Haas May 23, 2018, 3:51 p.m. UTC | #11
On Mon, 21 May 2018 14:50:17 +0800, Ruiling Song <ruiling.song@intel.com> wrote:
> +float3 map_one_pixel_rgb(float3 rgb, float peak, float average) {
> +    float sig = max(max(rgb.x, max(rgb.y, rgb.z)), 1e-6f);
> +    // de-saturate
> +    if (desat_param > 0.0f) {
> +        float luma = get_luma_dst(rgb);
> +        float base = 0.18f * dst_peak;
> +        float coeff = max(sig - base, 1e-6f) / max(sig, 1e-6f);
> +        coeff = native_powr(coeff, 10.0f / desat_param);
> +        rgb = mix(rgb, (float3)luma, (float3)coeff);
> +        sig = mix(sig, luma, coeff);
> +    }
> +
> +    float sig_old = sig;
> +    float slope = min(1.0f, sdr_avg / average);
> +    sig *= slope;
> +    peak *= slope;
> +
> +    sig = TONE_FUNC(sig, peak);
> +    rgb *= (sig/sig_old);
> +    return rgb;

Actually, a better way to do this is to swap the order of the `slope`
adjustment and the desaturation step. This prevents a problematic case
where very bright (badly mastered) sources end up getting too
aggressively desaturated.

Some care needs to be taken when swapping the order, so that the
multiplication is scaled correctly: since `sig` has already been
multiplied by `slope`, the luma target inside the `mix()` must be scaled
the same way. This should work:

  float3 map_one_pixel_rgb(float3 rgb, float peak, float average) {
      float sig = max(max(rgb.x, max(rgb.y, rgb.z)), 1e-6f);
      float sig_old = sig;
      float slope = min(1.0f, sdr_avg / average);
      sig *= slope;
      peak *= slope;

      // de-saturate
      if (desat_param > 0.0f) {
          float luma = get_luma_dst(rgb);
          float base = 0.18f * dst_peak;
          float coeff = max(sig - base, 1e-6f) / max(sig, 1e-6f);
          coeff = native_powr(coeff, 10.0f / desat_param);
          rgb = mix(rgb, (float3)luma, (float3)coeff);
          sig = mix(sig, luma * slope, coeff);
      }

      sig = TONE_FUNC(sig, peak);
      rgb *= (sig/sig_old);
      return rgb;
  }

I found out about this while testing some pathological HLG sources
earlier today.
Ruiling Song May 24, 2018, 8:57 a.m. UTC | #12
> -----Original Message-----

> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of

> Mark Thompson

> Sent: Tuesday, May 22, 2018 8:41 PM

> To: ffmpeg-devel@ffmpeg.org

> Subject: Re: [FFmpeg-devel] [PATCH] lavfi: add opencl tonemap filter.

> 

> On 22/05/18 09:48, Song, Ruiling wrote:

> >> -----Original Message-----

> >> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf

> Of

> >> Mark Thompson

> >> Sent: Tuesday, May 22, 2018 8:19 AM

> >> To: ffmpeg-devel@ffmpeg.org

> >> Subject: Re: [FFmpeg-devel] [PATCH] lavfi: add opencl tonemap filter.

> >>

> >> On 21/05/18 07:50, Ruiling Song wrote:

> >>> This filter does HDR(HDR10/HLG) to SDR conversion with tone-mapping.

> >>>

> >>> An example command to use this filter with vaapi codecs:

> >>> FFMPEG -init_hw_device vaapi=va:/dev/dri/renderD128 -init_hw_device \

> >>> opencl=ocl@va -hwaccel vaapi -hwaccel_device va -

> hwaccel_output_format \

> >>> vaapi -i INPUT -filter_hw_device ocl -filter_complex \

> >>> '[0:v]hwmap,tonemap_opencl=t=bt2020:tonemap=linear:format=p010[x1];

> \

> >>> [x1]hwmap=derive_device=vaapi:reverse=1' -c:v hevc_vaapi -profile 2

> >> OUTPUT

> >>>

> >>> Signed-off-by: Ruiling Song <ruiling.song@intel.com>

> >>> ---


> >>> +

> >>> +    err = av_frame_copy_props(output, input);

> >>> +    if (err < 0)

> >>> +        goto fail;

> 

> av_frame_copy_props() copies the side-data which will include the

> mastering/light-level information, but that's no longer valid after tonemapping?

I think so, but I am not sure how to update this information correctly.
Would replacing the original metadata with the resulting peak and the resulting color-space primaries be OK?
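
Or maybe the conservative option for now is to drop the stale HDR side data
right after av_frame_copy_props(), something like (just a sketch; whether
dropping is the right policy is exactly my question):

    av_frame_remove_side_data(output, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA);
    av_frame_remove_side_data(output, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL);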

Ruiling
Ruiling Song May 24, 2018, 8:58 a.m. UTC | #13
> -----Original Message-----

> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of

> Niklas Haas

> Sent: Wednesday, May 23, 2018 7:27 PM

> To: Song, Ruiling <ruiling.song@intel.com>

> Cc: Mark Thompson <sw@jkqxz.net>; FFmpeg development discussions and

> patches <ffmpeg-devel@ffmpeg.org>

> Subject: Re: [FFmpeg-devel] [PATCH] lavfi: add opencl tonemap filter.

> 

> > Excellent explanation. I think I get your idea. I will refine the code per

> your suggestion.

> > But I still have some questions: will people/tools tend to fill in the

> mastering information for HLG video?

> > I currently see no document that recommends filling in the mastering display

> metadata for HLG.

> > I only have one HLG sample, downloaded from 4kmedia.org, and it seems to

> have no mastering metadata.

> > Do you have any more HLG videos that show it will often be filled in?

> > My concern here is whether all video players will correctly parse the

> mastering display metadata to decode HLG, or just skip it because most HLG

> video has no metadata.

> 

> I think there's probably going to be three ways to approach this

> situation. Part of the problem is surely the fact that HLG is sort of

> designed to be "implicitly" tone-mapped. That is, the way the HLG

> standard is written, you'd just always encode things so that 12.0 is the

> brightest peak brightness, and a user with a 500 cd/m² peak TV would

> just apply the HLG OOTF tuned for 500 cd/m² on the original signal as

> received from the (e.g. blu-ray) source. Sure, the mastering engineer

> may have used a 1500 cd/m² screen to master it, but since the HLG

> OOTF-OOTF round-trip essentially constitutes a simple form of

> tone-mapping, the overall result on-screen will look more or less

> reasonable. (Certainly more reasonable than e.g. PQ)

> 

> So surely there's the camp of people that believe HLG doesn't need

> mastering metadata and will therefore not include it, because the end

> result without metadata looks more or less good enough. However, I

> disagree with this result. First of all, it prevents color-accurate

> round-trips. The HLG OOTF is inherently color-distorting, so in a

> color-managed workflow with calibrated devices, this methodology will

> not be sufficient to ensure perceptually accurate reproduction. The

> second reason is that as I said, the HLG OOTF-OOTF interaction

> essentially constitutes a simple form of tone-mapping; but we can do

> significantly better. I believe our tone mapping algorithm produces a

> far better result (visually) than applying the HLG OOTF as-is,

> especially when going to an SDR display. (If you're using mpv, you can

> test this by playing a HLG source once with --vf=format:peak=10 and once

> with --vf=format:peak=1. In the latter case, the only tone mapping being

> done is the implicit HLG tone mapping). Not only are HLG sources I've

> found inconsistently encoded, but also I find that the inherent HLG

> tone-mapping tends to over-saturate the signal (similar to the result we

> get if the desaturation strength is 0.0) while also boosting the gamma.

I agree with you. Thanks for the detailed explanation.

> 

> So if we subscribe to the idea that we need metadata to do

> color-accurate tone mapping and reproduction, then the question becomes:

> what do we do for un-tagged sources? The obvious approach is to assume a

> (display-referred) signal peak of 10.0 (thus corresponding to a display

> peak of 1000 cd/m², i.e. the HLG reference device). But I think if I was

> to make a HLG release of my own, I would definitely try and include the

> most accurate tagging possible. For example, if we have a clip available

> in both PQ and HLG, I would use the PQ version's mastering metadata for

> HLG as well.

Where does the "1000 cd/m² is the reference display peak" come from? There seems to be no clear statement in BT.2100.
If that is true, my code detects the peak of an untagged source incorrectly:
    if (!peak)
        peak = in->color_trc == AVCOL_TRC_SMPTE2084 ? 100.0f : 12.0f;
So here I should change it from 12.0f to 10.0f?
 
> Finally, to put the nail in the coffin of the idea that HLG doesn't need

> metadata, we should realize that the mastering metadata isn't just there

> to help you tone map the brightness levels, it also includes the

> display's gamut capabilities - and for a good reason. When doing

> desaturation in order to fit the BT.2020 signal into a (typically far

> less than BT.2020) display response, knowing the gamut limitations of

> the signal can similarly help users do a far better job than having to

> assume the worst case scenario - for much the same reason that knowing

> the signal's actual peak brightness can help users do a far better job

> tone-mapping than having to assume a worst-case peak of 10,000 cd/m².

> Indeed, in the best case scenario (your own display's gamut and

> brightness capabilities match or exceed the mastering display's), both

> of these can just be no-ops.

> 

> So if mastering metadata is beneficial at all, then we should also agree

> that mastering metadata is beneficial to BT.2020 + HLG sources, simply

> for the gamut data alone. The fact that HLG is ill-defined without

> knowing the mastering display's brightness is just icing on the cake at

> this point.

> 

> > Since what I do now is tone mapping from HDR to SDR, do you think it is

> meaningful to add the metadata for SDR video?

> 

> The mastering metadata is still useful for the gamut information as

> explained. Since you're (most likely) encoding a BT.2020 signal that

> doesn't use the full gamut range of BT.2020, even for SDR curves it can

> be a good idea to preserve it.

> 

> > And it looks like using a peak of 100 in inverse_ootf() when tone-mapping to

> SDR is just OK?

> 

> Sure. That won't blow up, but using HLG to store an SDR signal is sort

> of wasteful/pointless. Might as well use an actual SDR curve and skip

> the inverse_ootf step altogether.

Thanks for pointing this out. I misunderstood the code in libplacebo, because inverse_ootf() was also called if need_ootf is true, which made me fail to understand it correctly.
My fault. I have fixed it locally to remove the inverse_ootf for the SDR curve.

> 

> >

> > Thanks again for your kind advice and suggestions!

Niklas Haas May 24, 2018, 10:27 a.m. UTC | #14
On Thu, 24 May 2018 08:58:22 +0000, "Song, Ruiling" <ruiling.song@intel.com> wrote:
> Where does the "1000 cd/m² is the reference display peak" come from? There seems to be no clear statement in BT.2100.

The concept of there being a "standard" 1000 cd/m² display is introduced
in multiple places. Refer to Table 5 of ITU-R BT.2100, specifically the
section called "HLG Reference EOTF", in particular this definition:

> γ = 1.2 at the nominal display peak luminance of 1 000 cd/m². [5d, 5e, 5f]

And also the Note 5e below it, which explains how to adjust the gamma
if you are displaying on a display with a peak luminance that is
different from 1000 cd/m².
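
Concretely, the extended formula from that note is (a small sketch; the
function name is mine):

    #include <math.h>

    /* System gamma for a nominal display peak Lw, per the Note 5e
     * relationship: gamma = 1.2 + 0.42 * log10(Lw / 1000). */
    static float hlg_system_gamma(float peak_cd_m2)
    {
        return 1.2f + 0.42f * log10f(peak_cd_m2 / 1000.0f);
    }

so hlg_system_gamma(1000.0f) gives the reference 1.2, while a 500 cd/m²
display gets roughly 1.07.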

This is pretty much as close to saying that a reference HLG display
should have a peak of 1000 cd/m² as you can get without explicitly
saying it, since that value is essentially the assumption they
hard-coded into their formula.

In addition to this, the ITU-R further reinforces this concept heavily
throughout their ITU-R Report BT.2390, which includes e.g. such
sentences:

> In order to determine the appropriate system gamma for a 1 000 cd/m²
> reference display, NHK conducted a series of experiments with an
> indoor test scene.

So the concept of a “HLG reference display” is not something I made up.
(Incidentally, 1000 cd/m² is also the value you hard-code in your OOTF)

> If that is true, my code detects the peak of an untagged source incorrectly:
>     if (!peak)
>         peak = in->color_trc == AVCOL_TRC_SMPTE2084 ? 100.0f : 12.0f;
> So here I should change it from 12.0f to 10.0f?

Yes, given that you're defaulting the OOTF to 1000 this is definitely a
good idea, otherwise white won't map to white. (Observe that you have
the scaling factor in your ootf_hlg hard-coded as 1000.0f / REFERENCE_WHITE
= 10.0)
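
In other words, the defaulting you quoted would become:

    if (!peak)
        peak = in->color_trc == AVCOL_TRC_SMPTE2084 ? 100.0f : 10.0f;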

> Thanks for pointing this out. I misunderstood the code in libplacebo, because inverse_ootf() was also called if need_ootf is true, which made me fail to understand it correctly.

pl_shader_inverse_ootf() is parametrized by the actual OOTF to use.
Observe that the inverse_ootf() definition in libplacebo is a no-op if
the `enum pl_color_light` is given as PL_COLOR_LIGHT_DISPLAY.
diff mbox

Patch

diff --git a/configure b/configure
index e52f8f8..ee3586b 100755
--- a/configure
+++ b/configure
@@ -3401,6 +3401,7 @@  tinterlace_filter_deps="gpl"
 tinterlace_merge_test_deps="tinterlace_filter"
 tinterlace_pad_test_deps="tinterlace_filter"
 tonemap_filter_deps="const_nan"
+tonemap_opencl_filter_deps="opencl"
 unsharp_opencl_filter_deps="opencl"
 uspp_filter_deps="gpl avcodec"
 vaguedenoiser_filter_deps="gpl"
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index c68ef05..0915656 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -352,6 +352,8 @@  OBJS-$(CONFIG_TINTERLACE_FILTER)             += vf_tinterlace.o
 OBJS-$(CONFIG_TLUT2_FILTER)                  += vf_lut2.o framesync.o
 OBJS-$(CONFIG_TMIX_FILTER)                   += vf_mix.o framesync.o
 OBJS-$(CONFIG_TONEMAP_FILTER)                += vf_tonemap.o
+OBJS-$(CONFIG_TONEMAP_OPENCL_FILTER)         += vf_tonemap_opencl.o colorspace_basic.o opencl.o \
+                                                opencl/tonemap.o opencl/colorspace_basic.o
 OBJS-$(CONFIG_TRANSPOSE_FILTER)              += vf_transpose.o
 OBJS-$(CONFIG_TRIM_FILTER)                   += trim.o
 OBJS-$(CONFIG_UNPREMULTIPLY_FILTER)          += vf_premultiply.o framesync.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index b44093d..6873bab 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -343,6 +343,7 @@  extern AVFilter ff_vf_tinterlace;
 extern AVFilter ff_vf_tlut2;
 extern AVFilter ff_vf_tmix;
 extern AVFilter ff_vf_tonemap;
+extern AVFilter ff_vf_tonemap_opencl;
 extern AVFilter ff_vf_transpose;
 extern AVFilter ff_vf_trim;
 extern AVFilter ff_vf_unpremultiply;
diff --git a/libavfilter/colorspace_basic.c b/libavfilter/colorspace_basic.c
new file mode 100644
index 0000000..93f9f08
--- /dev/null
+++ b/libavfilter/colorspace_basic.c
@@ -0,0 +1,89 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "colorspace_basic.h"
+
+
+void invert_matrix3x3(const double in[3][3], double out[3][3])
+{
+    double m00 = in[0][0], m01 = in[0][1], m02 = in[0][2],
+           m10 = in[1][0], m11 = in[1][1], m12 = in[1][2],
+           m20 = in[2][0], m21 = in[2][1], m22 = in[2][2];
+    int i, j;
+    double det;
+
+    out[0][0] =  (m11 * m22 - m21 * m12);
+    out[0][1] = -(m01 * m22 - m21 * m02);
+    out[0][2] =  (m01 * m12 - m11 * m02);
+    out[1][0] = -(m10 * m22 - m20 * m12);
+    out[1][1] =  (m00 * m22 - m20 * m02);
+    out[1][2] = -(m00 * m12 - m10 * m02);
+    out[2][0] =  (m10 * m21 - m20 * m11);
+    out[2][1] = -(m00 * m21 - m20 * m01);
+    out[2][2] =  (m00 * m11 - m10 * m01);
+
+    det = m00 * out[0][0] + m10 * out[0][1] + m20 * out[0][2];
+    det = 1.0 / det;
+
+    for (i = 0; i < 3; i++) {
+        for (j = 0; j < 3; j++)
+            out[i][j] *= det;
+    }
+}
+
+void mul3x3(double dst[3][3], const double src1[3][3], const double src2[3][3])
+{
+    int m, n;
+
+    for (m = 0; m < 3; m++)
+        for (n = 0; n < 3; n++)
+            dst[m][n] = src2[m][0] * src1[0][n] +
+                        src2[m][1] * src1[1][n] +
+                        src2[m][2] * src1[2][n];
+}
+/*
+ * see e.g. http://www.brucelindbloom.com/index.html?Eqn_RGB_XYZ_Matrix.html
+ */
+void fill_rgb2xyz_table(const struct ColorPrimaries *coeffs,
+                        const struct WhitePoint *wp,
+                        double rgb2xyz[3][3])
+{
+    double i[3][3], sr, sg, sb, zw;
+
+    rgb2xyz[0][0] = coeffs->xr / coeffs->yr;
+    rgb2xyz[0][1] = coeffs->xg / coeffs->yg;
+    rgb2xyz[0][2] = coeffs->xb / coeffs->yb;
+    rgb2xyz[1][0] = rgb2xyz[1][1] = rgb2xyz[1][2] = 1.0;
+    rgb2xyz[2][0] = (1.0 - coeffs->xr - coeffs->yr) / coeffs->yr;
+    rgb2xyz[2][1] = (1.0 - coeffs->xg - coeffs->yg) / coeffs->yg;
+    rgb2xyz[2][2] = (1.0 - coeffs->xb - coeffs->yb) / coeffs->yb;
+    invert_matrix3x3(rgb2xyz, i);
+    zw = 1.0 - wp->xw - wp->yw;
+    sr = i[0][0] * wp->xw + i[0][1] * wp->yw + i[0][2] * zw;
+    sg = i[1][0] * wp->xw + i[1][1] * wp->yw + i[1][2] * zw;
+    sb = i[2][0] * wp->xw + i[2][1] * wp->yw + i[2][2] * zw;
+    rgb2xyz[0][0] *= sr;
+    rgb2xyz[0][1] *= sg;
+    rgb2xyz[0][2] *= sb;
+    rgb2xyz[1][0] *= sr;
+    rgb2xyz[1][1] *= sg;
+    rgb2xyz[1][2] *= sb;
+    rgb2xyz[2][0] *= sr;
+    rgb2xyz[2][1] *= sg;
+    rgb2xyz[2][2] *= sb;
+}
diff --git a/libavfilter/colorspace_basic.h b/libavfilter/colorspace_basic.h
new file mode 100644
index 0000000..5647ca6
--- /dev/null
+++ b/libavfilter/colorspace_basic.h
@@ -0,0 +1,40 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_COLORSPACE_BASIC_H
+#define AVFILTER_COLORSPACE_BASIC_H
+
+#include "libavutil/common.h"
+
+struct LumaCoefficients {
+    double cr, cg, cb;
+};
+
+struct ColorPrimaries {
+    double xr, yr, xg, yg, xb, yb;
+};
+
+struct WhitePoint {
+    double xw, yw;
+};
+
+void invert_matrix3x3(const double in[3][3], double out[3][3]);
+void mul3x3(double dst[3][3], const double src1[3][3], const double src2[3][3]);
+void fill_rgb2xyz_table(const struct ColorPrimaries *coeffs,
+                        const struct WhitePoint *wp, double rgb2xyz[3][3]);
+#endif
diff --git a/libavfilter/opencl/colorspace_basic.cl b/libavfilter/opencl/colorspace_basic.cl
new file mode 100644
index 0000000..ffd98c2
--- /dev/null
+++ b/libavfilter/opencl/colorspace_basic.cl
@@ -0,0 +1,179 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define ST2084_MAX_LUMINANCE 10000.0f
+#define REFERENCE_WHITE 100.0f
+constant const float ST2084_M1 = 0.1593017578125f;
+constant const float ST2084_M2 = 78.84375f;
+constant const float ST2084_C1 = 0.8359375f;
+constant const float ST2084_C2 = 18.8515625f;
+constant const float ST2084_C3 = 18.6875f;
+
+// TODO Move these colorspace matrices to .c files
+__constant float yuv2rgb_bt2020[] = {
+    1.0f, 0.0f, 1.4746f,
+    1.0f, -0.16455f, -0.57135f,
+    1.0f, 1.8814f, 0.0f
+};
+
+__constant float yuv2rgb_bt709[] = {
+    1.0f, 0.0f, 1.5748f,
+    1.0f, -0.18732f, -0.46812f,
+    1.0f, 1.8556f, 0.0f
+};
+
+__constant float rgb2yuv_bt709[] = {
+    0.2126f, 0.7152f, 0.0722f,
+    -0.11457f, -0.38543f, 0.5f,
+    0.5f, -0.45415f, -0.04585f
+};
+
+__constant float rgb2yuv_bt2020[] ={
+    0.2627f, 0.678f, 0.0593f,
+    -0.1396f, -0.36037f, 0.5f,
+    0.5f, -0.4598f, -0.0402f,
+};
+
+
+float get_luma_dst(float3 c) {
+    return luma_dst.x * c.x + luma_dst.y * c.y + luma_dst.z * c.z;
+}
+
+float get_luma_src(float3 c) {
+    return luma_src.x * c.x + luma_src.y * c.y + luma_src.z * c.z;
+}
+
+float eotf_st2084(float x) {
+    float p = pow(x, 1.0f / ST2084_M2);
+    float a = max(p - ST2084_C1, 0.0f);
+    float b = max(ST2084_C2 - ST2084_C3 * p, 1e-6f);
+    float c = pow(a / b, 1.0f / ST2084_M1);
+    return x > 0.0f ? c * ST2084_MAX_LUMINANCE / REFERENCE_WHITE : 0.0f;
+}
+
+__constant const float HLG_A = 0.17883277f;
+__constant const float HLG_B = 0.28466892f;
+__constant const float HLG_C = 0.55991073f;
+
+// linearizer for HLG
+float inverse_oetf_hlg(float x) {
+    float a = 4.0f * x * x;
+    float b = exp((x - HLG_C) / HLG_A) + HLG_B;
+    return x < 0.5f ? a : b;
+}
+
+// delinearizer for HLG
+float oetf_hlg(float x) {
+    float a = 0.5f * sqrt(x);
+    float b = HLG_A * log(x - HLG_B) + HLG_C;
+    return x <= 1.0f ? a : b;
+}
+
+float3 ootf_hlg(float3 c) {
+    float luma = get_luma_src(c);
+    // assume a reference display with 1000 nits peak
+    float factor = 1000.0f / REFERENCE_WHITE * pow(luma, 0.2f) / pow(12.0f, 1.2f);
+    return c * factor;
+}
+
+float3 inverse_ootf_hlg(float3 c) {
+    // assume a reference display with 1000 nits peak
+    c *= pow(12.0f, 1.2f) / (1000.0f / REFERENCE_WHITE);
+    c /= pow(get_luma_dst(c), 0.2f / 1.2f);
+    return c;
+}
+
+float inverse_eotf_bt1886(float c) {
+    return c < 0.0f ? 0.0f : pow(c, 1.0f / 2.4f);
+}
+
+float oetf_bt709(float c) {
+    c = c < 0.0f ? 0.0f : c;
+    float r1 = 4.5f * c;
+    float r2 = 1.099f * pow(c, 0.45f) - 0.099f;
+    return c < 0.018f ? r1 : r2;
+}
+float inverse_oetf_bt709(float c) {
+    float r1 = c / 4.5f;
+    float r2 = pow((c + 0.099f) / 1.099f, 1.0f / 0.45f);
+    return c < 0.081f ? r1 : r2;
+}
+
+float3 yuv2rgb(float y, float u, float v) {
+#ifdef FULL_RANGE_IN
+    u -= 0.5f; v -= 0.5f;
+#else
+    y = (y * 255.0f -  16.0f) / 219.0f;
+    u = (u * 255.0f - 128.0f) / 224.0f;
+    v = (v * 255.0f - 128.0f) / 224.0f;
+#endif
+    float r = y * rgb_matrix[0] + u * rgb_matrix[1] + v * rgb_matrix[2];
+    float g = y * rgb_matrix[3] + u * rgb_matrix[4] + v * rgb_matrix[5];
+    float b = y * rgb_matrix[6] + u * rgb_matrix[7] + v * rgb_matrix[8];
+    return (float3)(r, g, b);
+}
+
+float3 yuv2lrgb(float3 yuv) {
+    float3 rgb = yuv2rgb(yuv.x, yuv.y, yuv.z);
+    float r = linearize(rgb.x);
+    float g = linearize(rgb.y);
+    float b = linearize(rgb.z);
+    return (float3)(r, g, b);
+}
+
+float3 rgb2yuv(float r, float g, float b) {
+    float y = r*yuv_matrix[0] + g*yuv_matrix[1] + b*yuv_matrix[2];
+    float u = r*yuv_matrix[3] + g*yuv_matrix[4] + b*yuv_matrix[5];
+    float v = r*yuv_matrix[6] + g*yuv_matrix[7] + b*yuv_matrix[8];
+#ifdef FULL_RANGE_OUT
+    u += 0.5f; v += 0.5f;
+#else
+    y = (219.0f * y + 16.0f) / 255.0f;
+    u = (224.0f * u + 128.0f) / 255.0f;
+    v = (224.0f * v + 128.0f) / 255.0f;
+#endif
+    return (float3)(y, u, v);
+}
+
+float3 lrgb2yuv(float3 c) {
+    float r = delinearize(c.x);
+    float g = delinearize(c.y);
+    float b = delinearize(c.z);
+
+    return rgb2yuv(r, g, b);
+}
+
+float3 lrgb2lrgb(float3 c) {
+#ifdef RGB2RGB_PASSTHROUGH
+    return c;
+#else
+    float r = c.x, g = c.y, b = c.z;
+    float rr = rgb2rgb[0] * r + rgb2rgb[1] * g + rgb2rgb[2] * b;
+    float gg = rgb2rgb[3] * r + rgb2rgb[4] * g + rgb2rgb[5] * b;
+    float bb = rgb2rgb[6] * r + rgb2rgb[7] * g + rgb2rgb[8] * b;
+    return (float3)(rr, gg, bb);
+#endif
+}
+
+float3 ootf(float3 c) {
+    return ootf_impl(c);
+}
+
+float3 inverse_ootf(float3 c) {
+    return inverse_ootf_impl(c);
+}
diff --git a/libavfilter/opencl/tonemap.cl b/libavfilter/opencl/tonemap.cl
new file mode 100644
index 0000000..03cf3e2
--- /dev/null
+++ b/libavfilter/opencl/tonemap.cl
@@ -0,0 +1,258 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define REFERENCE_WHITE 100.0f
+extern float3 lrgb2yuv(float3);
+extern float3 yuv2lrgb(float3);
+extern float3 lrgb2lrgb(float3);
+extern float get_luma_src(float3);
+extern float get_luma_dst(float3);
+extern float3 ootf(float3);
+extern float3 inverse_ootf(float3);
+struct detection_result {
+    float peak;
+    float average;
+};
+
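+// John Hable's filmic curve (the "Uncharted 2" operator); the constants
+// below are Hable's published defaults.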
+float hable_f(float in) {
+    float a = 0.15f, b = 0.50f, c = 0.10f, d = 0.20f, e = 0.02f, f = 0.30f;
+    return (in * (in * a + b * c) + d * e) / (in * (in * a + b) + d * f) - e / f;
+}
+
+float direct(float s, float peak) {
+    return s;
+}
+
+float linear(float s, float peak) {
+    return s * tone_param / peak;
+}
+
+float gamma(float s, float peak) {
+    float p = s > 0.05f ? s / peak : 0.05f / peak;
+    float v = pow(p, 1.0f / tone_param);
+    return s > 0.05f ? v : (s * v / 0.05f);
+}
+
+float clip(float s, float peak) {
+    return clamp(s * tone_param, 0.0f, 1.0f);
+}
+
+float reinhard(float s, float peak) {
+    return s / (s + tone_param) * (peak + tone_param) / peak;
+}
+
+float hable(float s, float peak) {
+    return hable_f(s)/hable_f(peak);
+}
+
+float mobius(float s, float peak) {
+    float j = tone_param;
+    float a, b;
+
+    if (s <= j)
+        return s;
+
+    a = -j * j * (peak - 1.0f) / (j * j - 2.0f * j + peak);
+    b = (j * j - 2.0f * j * peak + peak) / max(peak - 1.0f, 1e-6f);
+
+    return (b * b + 2.0f * b * j + j * j) / (b - a) * (s + a) / (s + b);
+}
+
+// detect peak/average signal of a frame, the algorithm was ported from:
+// libplacebo (https://github.com/haasn/libplacebo)
+struct detection_result
+detect_peak_avg(global uint *util_buf, __local uint *sum_wg,
+            float signal, float peak) {
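+    // util_buf layout (all uints):
+    //   [0 .. DETECTION_FRAMES]                      per-frame average, ring buffer
+    //   [DETECTION_FRAMES+1 .. 2*DETECTION_FRAMES+1] per-frame peak, ring buffer
+    //   followed by five counters: finished-workgroup count, sliding-window
+    //   peak total, sliding-window average total, current frame index, and
+    //   the number of frames accumulated in the current scene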
+    global uint *avg_buf = util_buf;
+    global uint *peak_buf = avg_buf + DETECTION_FRAMES + 1;
+    global uint *counter_wg_p = peak_buf + DETECTION_FRAMES + 1;
+    global uint *max_total_p = counter_wg_p + 1;
+    global uint *avg_total_p = max_total_p + 1;
+    global uint *frame_idx_p = avg_total_p + 1;
+    global uint *scene_frame_num_p = frame_idx_p + 1;
+
+    uint frame_idx = *frame_idx_p;
+    uint scene_frame_num = *scene_frame_num_p;
+
+    size_t lidx = get_local_id(0);
+    size_t lidy = get_local_id(1);
+    size_t lsizex = get_local_size(0);
+    size_t lsizey = get_local_size(1);
+    uint num_wg = get_num_groups(0) * get_num_groups(1);
+    size_t group_idx = get_group_id(0);
+    size_t group_idy = get_group_id(1);
+    struct detection_result r = {peak, sdr_avg};
+    *sum_wg = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // update workgroup sum
+    atomic_add(sum_wg, (uint)(signal * REFERENCE_WHITE));
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // update frame peak/avg using work-group-average.
+    if (lidx == 0 && lidy == 0) {
+        uint avg_wg = *sum_wg / (lsizex * lsizey);
+        atomic_max(&peak_buf[frame_idx], avg_wg);
+        atomic_add(&avg_buf[frame_idx], avg_wg);
+    }
+
+    if (scene_frame_num > 0) {
+        float peak = (float)*max_total_p / (REFERENCE_WHITE * scene_frame_num);
+        float avg = (float)*avg_total_p / (REFERENCE_WHITE * scene_frame_num);
+        r.peak = max(1.0f, peak);
+        r.average = max(0.25f, avg);
+    }
+
+    if (lidx == 0 && lidy == 0 && atomic_add(counter_wg_p, 1) == num_wg - 1) {
+        *counter_wg_p = 0;
+        avg_buf[frame_idx] /= num_wg;
+
+        if (scene_threshold > 0.0f) {
+            uint cur_max = peak_buf[frame_idx];
+            uint cur_avg = avg_buf[frame_idx];
+            int diff = (int)(scene_frame_num * cur_avg) - (int)*avg_total_p;
+
+            if (abs(diff) > scene_frame_num * scene_threshold * REFERENCE_WHITE) {
+                for (uint i = 0; i < DETECTION_FRAMES + 1; i++)
+                  avg_buf[i] = 0;
+                for (uint i = 0; i < DETECTION_FRAMES + 1; i++)
+                  peak_buf[i] = 0;
+                *avg_total_p = *max_total_p = 0;
+                *scene_frame_num_p = 0;
+                avg_buf[frame_idx] = cur_avg;
+                peak_buf[frame_idx] = cur_max;
+            }
+        }
+        uint next = (frame_idx + 1) % (DETECTION_FRAMES + 1);
+        // add current frame, subtract next frame
+        *max_total_p += peak_buf[frame_idx] - peak_buf[next];
+        *avg_total_p += avg_buf[frame_idx] - avg_buf[next];
+        // reset next frame
+        peak_buf[next] = avg_buf[next] = 0;
+        *frame_idx_p = next;
+        *scene_frame_num_p = min(*scene_frame_num_p + 1, (uint)DETECTION_FRAMES);
+    }
+    return r;
+}
+
+__constant const float desat_param = 0.5f;
+__constant const float dst_peak    = 1.0f;
+
+float3 map_one_pixel_rgb(float3 rgb, float peak, float average) {
+    float sig = max(max(rgb.x, max(rgb.y, rgb.z)), 1e-6f);
+    // de-saturate
+    if (desat_param > 0.0f) {
+        float luma = get_luma_dst(rgb);
+        float base = 0.18f * dst_peak;
+        float coeff = max(sig - base, 1e-6f) / max(sig, 1e-6f);
+        coeff = native_powr(coeff, 10.0f / desat_param);
+        rgb = mix(rgb, (float3)luma, (float3)coeff);
+        sig = mix(sig, luma, coeff);
+    }
+
+    float sig_old = sig;
+    float slope = min(1.0f, sdr_avg / average);
+    sig *= slope;
+    peak *= slope;
+
+    sig = TONE_FUNC(sig, peak);
+    rgb *= (sig/sig_old);
+    return rgb;
+}
+// map from source space YUV to destination space RGB
+float3 map_to_dst_space_from_yuv(float3 yuv) {
+    float3 c = yuv2lrgb(yuv);
+    c = ootf(c);
+    c = lrgb2lrgb(c);
+    return c;
+}
+
+// convert from rgb to yuv, with possible inverse-ootf
+float3 convert_to_yuv(float3 c) {
+    c = inverse_ootf(c);
+    return lrgb2yuv(c);
+}
+
+__kernel void tonemap(__write_only image2d_t dst1,
+                      __write_only image2d_t dst2,
+                      __read_only  image2d_t src1,
+                      __read_only  image2d_t src2,
+#ifdef THIRD_PLANE
+                      __write_only image2d_t dst3,
+                      __read_only  image2d_t src3,
+#endif
+                      global uint *util_buf,
+                      float peak
+                      )
+{
+    __local uint sum_wg;
+    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
+                               CLK_FILTER_NEAREST);
+    int xi = get_global_id(0);
+    int yi = get_global_id(1);
+    // each work item process four pixels
+    int x = 2 * xi;
+    int y = 2 * yi;
+
+    float y0 = read_imagef(src1, sampler, (int2)(x,     y)).x;
+    float y1 = read_imagef(src1, sampler, (int2)(x + 1, y)).x;
+    float y2 = read_imagef(src1, sampler, (int2)(x,     y + 1)).x;
+    float y3 = read_imagef(src1, sampler, (int2)(x + 1, y + 1)).x;
+#ifdef THIRD_PLANE
+    float u = read_imagef(src2, sampler, (int2)(xi, yi)).x;
+    float v = read_imagef(src3, sampler, (int2)(xi, yi)).x;
+    float2 uv = (float2)(u, v);
+#else
+    float2 uv = read_imagef(src2, sampler, (int2)(xi,     yi)).xy;
+#endif
+
+    float3 c0 = map_to_dst_space_from_yuv((float3)(y0, uv.x, uv.y));
+    float3 c1 = map_to_dst_space_from_yuv((float3)(y1, uv.x, uv.y));
+    float3 c2 = map_to_dst_space_from_yuv((float3)(y2, uv.x, uv.y));
+    float3 c3 = map_to_dst_space_from_yuv((float3)(y3, uv.x, uv.y));
+
+    float sig0 = max(c0.x, max(c0.y, c0.z));
+    float sig1 = max(c1.x, max(c1.y, c1.z));
+    float sig2 = max(c2.x, max(c2.y, c2.z));
+    float sig3 = max(c3.x, max(c3.y, c3.z));
+    float sig = max(sig0, max(sig1, max(sig2, sig3)));
+
+    struct detection_result r = detect_peak_avg(util_buf, &sum_wg, sig, peak);
+
+    float3 c0_old = c0, c1_old = c1, c2_old = c2;
+    c0 = map_one_pixel_rgb(c0, r.peak, r.average);
+    c1 = map_one_pixel_rgb(c1, r.peak, r.average);
+    c2 = map_one_pixel_rgb(c2, r.peak, r.average);
+    c3 = map_one_pixel_rgb(c3, r.peak, r.average);
+
+    float3 yuv0 = convert_to_yuv(c0);
+    float3 yuv1 = convert_to_yuv(c1);
+    float3 yuv2 = convert_to_yuv(c2);
+    float3 yuv3 = convert_to_yuv(c3);
+
+    write_imagef(dst1, (int2)(x, y), (float4)(yuv0.x, 0.0f, 0.0f, 1.0f));
+    write_imagef(dst1, (int2)(x+1, y), (float4)(yuv1.x, 0.0f, 0.0f, 1.0f));
+    write_imagef(dst1, (int2)(x, y+1), (float4)(yuv2.x, 0.0f, 0.0f, 1.0f));
+    write_imagef(dst1, (int2)(x+1, y+1), (float4)(yuv3.x, 0.0f, 0.0f, 1.0f));
+#ifdef THIRD_PLANE
+    write_imagef(dst2, (int2)(xi, yi), (float4)(yuv0.y, 0.0f, 0.0f, 1.0f));
+    write_imagef(dst3, (int2)(xi, yi), (float4)(yuv0.z, 0.0f, 0.0f, 1.0f));
+#else
+    write_imagef(dst2, (int2)(xi, yi), (float4)(yuv0.y, yuv0.z, 0.0f, 1.0f));
+#endif
+}
diff --git a/libavfilter/opencl_source.h b/libavfilter/opencl_source.h
index 4bb9969..c5b3f37 100644
--- a/libavfilter/opencl_source.h
+++ b/libavfilter/opencl_source.h
@@ -21,7 +21,9 @@ 
 
 extern const char *ff_opencl_source_avgblur;
 extern const char *ff_opencl_source_convolution;
+extern const char *ff_opencl_source_colorspace_basic;
 extern const char *ff_opencl_source_overlay;
+extern const char *ff_opencl_source_tonemap;
 extern const char *ff_opencl_source_unsharp;
 
 #endif /* AVFILTER_OPENCL_SOURCE_H */
diff --git a/libavfilter/vf_tonemap_opencl.c b/libavfilter/vf_tonemap_opencl.c
new file mode 100644
index 0000000..e2311e0
--- /dev/null
+++ b/libavfilter/vf_tonemap_opencl.c
@@ -0,0 +1,560 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include <float.h>
+
+#include "libavutil/bprint.h"
+#include "libavutil/common.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/mastering_display_metadata.h"
+#include "libavutil/mem.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "avfilter.h"
+#include "internal.h"
+#include "opencl.h"
+#include "opencl_source.h"
+#include "video.h"
+#include "colorspace_basic.h"
+
+#define DETECTION_FRAMES 63
+#define REFERENCE_WHITE 100.0f
+
+enum TonemapAlgorithm {
+    TONEMAP_NONE,
+    TONEMAP_LINEAR,
+    TONEMAP_GAMMA,
+    TONEMAP_CLIP,
+    TONEMAP_REINHARD,
+    TONEMAP_HABLE,
+    TONEMAP_MOBIUS,
+    TONEMAP_MAX,
+};
+
+typedef struct TonemapOpenCLContext {
+    OpenCLFilterContext ocf;
+
+    enum AVColorSpace colorspace, colorspace_in, colorspace_out;
+    enum AVColorTransferCharacteristic trc, trc_in, trc_out;
+    enum AVColorPrimaries primaries, primaries_in, primaries_out;
+    enum AVColorRange range, range_in, range_out;
+
+    enum TonemapAlgorithm tonemap;
+    enum AVPixelFormat    format;
+    double                peak;
+    double                param;
+    int                   initialised;
+    cl_kernel             kernel;
+    cl_command_queue      command_queue;
+    cl_mem                util_mem;
+    DECLARE_ALIGNED(64, int32_t, util_buf)[2 * DETECTION_FRAMES + 7];
+} TonemapOpenCLContext;
+
+const char *yuv_coff[AVCOL_SPC_NB] = {
+    [AVCOL_SPC_BT709] = "rgb2yuv_bt709",
+    [AVCOL_SPC_BT2020_NCL] = "rgb2yuv_bt2020",
+};
+
+const char *rgb_coff[AVCOL_SPC_NB] = {
+    [AVCOL_SPC_BT709] = "yuv2rgb_bt709",
+    [AVCOL_SPC_BT2020_NCL] = "yuv2rgb_bt2020",
+};
+
+const char *linearize_funcs[AVCOL_TRC_NB] = {
+    [AVCOL_TRC_SMPTE2084] = "eotf_st2084",
+    [AVCOL_TRC_ARIB_STD_B67] = "inverse_oetf_hlg",
+};
+
+const char *ootf_funcs[AVCOL_TRC_NB] = {
+    [AVCOL_TRC_ARIB_STD_B67] = "ootf_hlg",
+    [AVCOL_TRC_SMPTE2084] = "",
+};
+
+const char *inverse_ootf_funcs[AVCOL_TRC_NB] = {
+    [AVCOL_TRC_ARIB_STD_B67] = "inverse_ootf_hlg",
+    [AVCOL_TRC_SMPTE2084] = "",
+};
+
+const char *delinearize_funcs[AVCOL_TRC_NB] = {
+    [AVCOL_TRC_BT709]     = "inverse_eotf_bt1886",
+    [AVCOL_TRC_BT2020_10] = "inverse_eotf_bt1886",
+};
+
+static const struct LumaCoefficients luma_coefficients[AVCOL_SPC_NB] = {
+    [AVCOL_SPC_BT709]      = { 0.2126, 0.7152, 0.0722 },
+    [AVCOL_SPC_BT2020_NCL] = { 0.2627, 0.6780, 0.0593 },
+};
+
+struct ColorPrimaries primaries_table[AVCOL_PRI_NB] = {
+    [AVCOL_PRI_BT709]  = { 0.640, 0.330, 0.300, 0.600, 0.150, 0.060 },
+    [AVCOL_PRI_BT2020] = { 0.708, 0.292, 0.170, 0.797, 0.131, 0.046 },
+};
+
+struct WhitePoint whitepoint_table[AVCOL_PRI_NB] = {
+    [AVCOL_PRI_BT709]  = { 0.3127, 0.3290 },
+    [AVCOL_PRI_BT2020] = { 0.3127, 0.3290 },
+};
+
+const char *tonemap_func[TONEMAP_MAX] = {
+    [TONEMAP_NONE]     = "direct",
+    [TONEMAP_LINEAR]   = "linear",
+    [TONEMAP_GAMMA]    = "gamma",
+    [TONEMAP_CLIP]     = "clip",
+    [TONEMAP_REINHARD] = "reinhard",
+    [TONEMAP_HABLE]    = "hable",
+    [TONEMAP_MOBIUS]   = "mobius",
+};
+
+static void get_rgb2rgb_matrix(enum AVColorPrimaries in, enum AVColorPrimaries out,
+                               double rgb2rgb[3][3]) {
+    double rgb2xyz[3][3], xyz2rgb[3][3];
+
+    fill_rgb2xyz_table(&primaries_table[out], &whitepoint_table[out], rgb2xyz);
+    invert_matrix3x3(rgb2xyz, xyz2rgb);
+    fill_rgb2xyz_table(&primaries_table[in], &whitepoint_table[in], rgb2xyz);
+    mul3x3(rgb2rgb, rgb2xyz, xyz2rgb);
+}
+
+#define OPENCL_SOURCE_NB 3
+// Average light level for SDR signals. This is equal to a signal level of 0.5
+// under a typical presentation gamma of about 2.0.
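+// (0.5 ^ 2.0 = 0.25, the sdr_avg value below.)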
+static const float sdr_avg = 0.25f;
+static const float scene_threshold = 0.2f;
+
+static int tonemap_opencl_init(AVFilterContext *avctx)
+{
+    TonemapOpenCLContext *ctx = avctx->priv;
+    int rgb2rgb_passthrough = 1;
+    double rgb2rgb[3][3];
+    struct LumaCoefficients luma_src, luma_dst;
+    cl_int cle;
+    int err;
+    AVBPrint header;
+    const char *opencl_sources[OPENCL_SOURCE_NB];
+
+    av_bprint_init(&header, 1024, AV_BPRINT_SIZE_AUTOMATIC);
+
+    switch(ctx->tonemap) {
+    case TONEMAP_GAMMA:
+        if (isnan(ctx->param))
+            ctx->param = 1.8f;
+        break;
+    case TONEMAP_REINHARD:
+        if (!isnan(ctx->param))
+            ctx->param = (1.0f - ctx->param) / ctx->param;
+        break;
+    case TONEMAP_MOBIUS:
+        if (isnan(ctx->param))
+            ctx->param = 0.3f;
+        break;
+    }
+
+    if (isnan(ctx->param))
+        ctx->param = 1.0f;
+
+    av_bprintf(&header, "__constant const float tone_param = %.4ff;\n",
+               ctx->param);
+    av_bprintf(&header, "__constant const float sdr_avg = %.4ff;\n", sdr_avg);
+    av_bprintf(&header, "__constant const float scene_threshold = %.4ff;\n",
+               scene_threshold);
+    av_bprintf(&header, "#define TONE_FUNC %s\n", tonemap_func[ctx->tonemap]);
+    av_bprintf(&header, "#define DETECTION_FRAMES %d\n", DETECTION_FRAMES);
+
+    if (ctx->primaries_out != ctx->primaries_in) {
+        get_rgb2rgb_matrix(ctx->primaries_in, ctx->primaries_out, rgb2rgb);
+        rgb2rgb_passthrough = 0;
+    }
+    if (ctx->range_in == AVCOL_RANGE_JPEG)
+        av_bprintf(&header, "#define FULL_RANGE_IN\n");
+
+    if (ctx->range_out == AVCOL_RANGE_JPEG)
+        av_bprintf(&header, "#define FULL_RANGE_OUT\n");
+
+    if (rgb2rgb_passthrough)
+        av_bprintf(&header, "#define RGB2RGB_PASSTHROUGH\n");
+    else {
+        av_bprintf(&header, "__constant float rgb2rgb[9] = {\n");
+        av_bprintf(&header, "    %.4ff, %.4ff, %.4ff,\n",
+                   rgb2rgb[0][0], rgb2rgb[0][1], rgb2rgb[0][2]);
+        av_bprintf(&header, "    %.4ff, %.4ff, %.4ff,\n",
+                   rgb2rgb[1][0], rgb2rgb[1][1], rgb2rgb[1][2]);
+        av_bprintf(&header, "    %.4ff, %.4ff, %.4ff};\n",
+                   rgb2rgb[2][0], rgb2rgb[2][1], rgb2rgb[2][2]);
+    }
+
+    av_bprintf(&header, "#define rgb_matrix %s\n",
+               rgb_coff[ctx->colorspace_in]);
+    av_bprintf(&header, "#define yuv_matrix %s\n",
+               yuv_coff[ctx->colorspace_out]);
+
+    luma_src = luma_coefficients[ctx->colorspace_in];
+    luma_dst = luma_coefficients[ctx->colorspace_out];
+    av_bprintf(&header, "constant float3 luma_src = {%.4ff, %.4ff, %.4ff};\n",
+               luma_src.cr, luma_src.cg, luma_src.cb);
+    av_bprintf(&header, "constant float3 luma_dst = {%.4ff, %.4ff, %.4ff};\n",
+               luma_dst.cr, luma_dst.cg, luma_dst.cb);
+
+    av_bprintf(&header, "#define linearize %s\n", linearize_funcs[ctx->trc_in]);
+    av_bprintf(&header, "#define delinearize %s\n",
+               delinearize_funcs[ctx->trc_out]);
+
+    av_bprintf(&header, "#define ootf_impl %s\n", ootf_funcs[ctx->trc_in]);
+    av_bprintf(&header, "#define inverse_ootf_impl %s\n",
+               inverse_ootf_funcs[ctx->trc_in]);
+
+
+    opencl_sources[0] = header.str;
+    opencl_sources[1] = ff_opencl_source_tonemap;
+    opencl_sources[2] = ff_opencl_source_colorspace_basic;
+    err = ff_opencl_filter_load_program(avctx, opencl_sources, OPENCL_SOURCE_NB);
+
+    av_bprint_finalize(&header, NULL);
+    if (err < 0)
+        goto fail;
+
+    ctx->command_queue = clCreateCommandQueue(ctx->ocf.hwctx->context,
+                                              ctx->ocf.hwctx->device_id,
+                                              0, &cle);
+    if (!ctx->command_queue) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to create OpenCL "
+               "command queue: %d.\n", cle);
+        err = AVERROR(EIO);
+        goto fail;
+    }
+
+    ctx->kernel = clCreateKernel(ctx->ocf.program, "tonemap", &cle);
+    if (!ctx->kernel) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to create kernel: %d.\n", cle);
+        err = AVERROR(EIO);
+        goto fail;
+    }
+
+    ctx->util_mem = clCreateBuffer(ctx->ocf.hwctx->context,
+                                   CL_MEM_USE_HOST_PTR |
+                                   CL_MEM_HOST_NO_ACCESS,
+                                   sizeof(ctx->util_buf), ctx->util_buf, &cle);
+    if (cle != CL_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to create util buffer: %d.\n", cle);
+        err = AVERROR(EIO);
+        goto fail;
+    }
+
+    ctx->initialised = 1;
+    return 0;
+
+fail:
+    if (ctx->util_mem)
+        clReleaseMemObject(ctx->util_mem);
+    if (ctx->command_queue)
+        clReleaseCommandQueue(ctx->command_queue);
+    if (ctx->kernel)
+        clReleaseKernel(ctx->kernel);
+    return err;
+}
+
+static int tonemap_opencl_config_output(AVFilterLink *outlink)
+{
+    AVFilterContext *avctx = outlink->src;
+    TonemapOpenCLContext *s = avctx->priv;
+    int ret;
+    s->ocf.output_format = s->format == AV_PIX_FMT_NONE ? AV_PIX_FMT_NV12 : s->format;
+    ret = ff_opencl_filter_config_output(outlink);
+    if (ret < 0)
+        return ret;
+
+    return 0;
+}
+
+static int launch_kernel(AVFilterContext *avctx, cl_kernel kernel,
+                         AVFrame *output, AVFrame *input, float peak) {
+    TonemapOpenCLContext *ctx = avctx->priv;
+    int err = AVERROR(ENOSYS);
+    size_t global_work[2];
+    size_t local_work[2];
+    cl_int cle;
+
+    cle = clSetKernelArg(kernel, 0, sizeof(cl_mem), &output->data[0]);
+    if (cle != CL_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
+               "destination image 1st plane: %d.\n", cle);
+        return AVERROR(EINVAL);
+    }
+
+    cle = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output->data[1]);
+    if (cle != CL_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
+               "destination image 2nd plane: %d.\n", cle);
+        return AVERROR(EINVAL);
+    }
+
+    cle = clSetKernelArg(kernel, 2, sizeof(cl_mem), &input->data[0]);
+    if (cle != CL_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
+               "source image 1st plane: %d.\n", cle);
+        return AVERROR(EINVAL);
+    }
+
+    cle = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input->data[1]);
+    if (cle != CL_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
+               "source image 2nd plane: %d.\n", cle);
+        return AVERROR(EINVAL);
+    }
+
+    cle = clSetKernelArg(kernel, 4, sizeof(cl_mem), &ctx->util_mem);
+    if (cle != CL_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
+               "source image 2nd plane: %d.\n", cle);
+        return AVERROR(EINVAL);
+    }
+
+    cle = clSetKernelArg(kernel, 5, sizeof(cl_float), &peak);
+    if (cle != CL_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
+               "peak luma: %d.\n", cle);
+        return AVERROR(EINVAL);
+    }
+
+    local_work[0]  = 16;
+    local_work[1]  = 16;
+    // Note the work size based on uv plane, as we process a 2x2 quad in one workitem
+    err = ff_opencl_filter_work_size_from_image(avctx, global_work, output,
+                                                1, 16);
+    if (err < 0)
+        return err;
+
+    cle = clEnqueueNDRangeKernel(ctx->command_queue, kernel, 2, NULL,
+                                 global_work, local_work,
+                                 0, NULL, NULL);
+    if (cle != CL_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to enqueue kernel: %d.\n",
+               cle);
+        return AVERROR(EIO);
+    }
+    return 0;
+}
+
+static double determine_signal_peak(AVFrame *in)
+{
+    AVFrameSideData *sd = av_frame_get_side_data(in, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL);
+    double peak = 0;
+
+    if (sd) {
+        AVContentLightMetadata *clm = (AVContentLightMetadata *)sd->data;
+        peak = clm->MaxCLL / REFERENCE_WHITE;
+    }
+
+    sd = av_frame_get_side_data(in, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA);
+    if (!peak && sd) {
+        AVMasteringDisplayMetadata *metadata = (AVMasteringDisplayMetadata *)sd->data;
+        if (metadata->has_luminance)
+            peak = av_q2d(metadata->max_luminance) / REFERENCE_WHITE;
+    }
+
+    // No usable metadata: fall back to the nominal peak of the transfer
+    // function (100 for SMPTE ST 2084/PQ, 12 for HLG).
+    if (!peak)
+        peak = in->color_trc == AVCOL_TRC_SMPTE2084 ? 100.0f : 12.0f;
+
+    return peak;
+}
+
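+/*
+ * Per-frame entry point: allocate the output frame, apply any user
+ * colour-property overrides, lazily build the kernel on the first frame,
+ * run it, and forward the result downstream.
+ */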
+static int tonemap_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
+{
+    AVFilterContext    *avctx = inlink->dst;
+    AVFilterLink     *outlink = avctx->outputs[0];
+    TonemapOpenCLContext *ctx = avctx->priv;
+    AVFrame *output = NULL;
+    cl_int cle;
+    int err;
+    double peak = ctx->peak;
+
+    AVHWFramesContext *input_frames_ctx;
+
+    av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",
+           av_get_pix_fmt_name(input->format),
+           input->width, input->height, input->pts);
+
+    // Validate the hardware frames context before dereferencing it.
+    if (!input->hw_frames_ctx)
+        return AVERROR(EINVAL);
+    input_frames_ctx = (AVHWFramesContext*)input->hw_frames_ctx->data;
+
+    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+    if (!output) {
+        err = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    err = av_frame_copy_props(output, input);
+    if (err < 0)
+        goto fail;
+
+    if (!peak)
+        peak = determine_signal_peak(input);
+
+    if (ctx->trc != -1)
+        output->color_trc = ctx->trc;
+    if (ctx->primaries != -1)
+        output->color_primaries = ctx->primaries;
+    if (ctx->colorspace != -1)
+        output->colorspace = ctx->colorspace;
+    if (ctx->range != -1)
+        output->color_range = ctx->range;
+
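+    // Record the effective input/output colour properties; the OpenCL
+    // kernel is built lazily on the first frame, once these are known.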
+    ctx->trc_in = input->color_trc;
+    ctx->trc_out = output->color_trc;
+    ctx->colorspace_in = input->colorspace;
+    ctx->colorspace_out = output->colorspace;
+    ctx->primaries_in = input->color_primaries;
+    ctx->primaries_out = output->color_primaries;
+    ctx->range_in = input->color_range;
+    ctx->range_out = output->color_range;
+
+    if (!ctx->initialised) {
+        err = tonemap_opencl_init(avctx);
+        if (err < 0)
+            goto fail;
+    }
+
+    switch (input_frames_ctx->sw_format) {
+    case AV_PIX_FMT_P010:
+        err = launch_kernel(avctx, ctx->kernel, output, input, peak);
+        if (err < 0)
+            goto fail;
+        break;
+    default:
+        av_log(ctx, AV_LOG_ERROR, "Unsupported input format in tonemap_opencl.\n");
+        err = AVERROR(ENOSYS);
+        goto fail;
+    }
+
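+    // Block until the enqueued kernel has finished, so the output frame
+    // is complete before it is passed downstream.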
+    cle = clFinish(ctx->command_queue);
+    if (cle != CL_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to finish command queue: %d.\n",
+               cle);
+        err = AVERROR(EIO);
+        goto fail;
+    }
+
+    av_frame_free(&input);
+
+    av_log(ctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n",
+           av_get_pix_fmt_name(output->format),
+           output->width, output->height, output->pts);
+
+    return ff_filter_frame(outlink, output);
+
+fail:
+    if (ctx->command_queue)
+        clFinish(ctx->command_queue);
+    av_frame_free(&input);
+    av_frame_free(&output);
+    return err;
+}
+
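+// Release any OpenCL objects that were created; each handle is checked
+// first, so this is safe on a partially initialised context.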
+static av_cold void tonemap_opencl_uninit(AVFilterContext *avctx)
+{
+    TonemapOpenCLContext *ctx = avctx->priv;
+    cl_int cle;
+
+    if (ctx->util_mem)
+        clReleaseMemObject(ctx->util_mem);
+    if (ctx->kernel) {
+        cle = clReleaseKernel(ctx->kernel);
+        if (cle != CL_SUCCESS)
+            av_log(avctx, AV_LOG_ERROR, "Failed to release "
+                   "kernel: %d.\n", cle);
+    }
+
+    if (ctx->command_queue) {
+        cle = clReleaseCommandQueue(ctx->command_queue);
+        if (cle != CL_SUCCESS)
+            av_log(avctx, AV_LOG_ERROR, "Failed to release "
+                   "command queue: %d.\n", cle);
+    }
+
+    ff_opencl_filter_uninit(avctx);
+}
+
+#define OFFSET(x) offsetof(TonemapOpenCLContext, x)
+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
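+// For the colour-property options below, a default of -1 means "inherit
+// the corresponding property from the input frame".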
+static const AVOption tonemap_opencl_options[] = {
+    { "tonemap",      "tonemap algorithm selection", OFFSET(tonemap), AV_OPT_TYPE_INT, {.i64 = TONEMAP_NONE}, TONEMAP_NONE, TONEMAP_MAX - 1, FLAGS, "tonemap" },
+    {     "none",     0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_NONE},              0, 0, FLAGS, "tonemap" },
+    {     "linear",   0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_LINEAR},            0, 0, FLAGS, "tonemap" },
+    {     "gamma",    0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_GAMMA},             0, 0, FLAGS, "tonemap" },
+    {     "clip",     0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_CLIP},              0, 0, FLAGS, "tonemap" },
+    {     "reinhard", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_REINHARD},          0, 0, FLAGS, "tonemap" },
+    {     "hable",    0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_HABLE},             0, 0, FLAGS, "tonemap" },
+    {     "mobius",   0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_MOBIUS},            0, 0, FLAGS, "tonemap" },
+    { "transfer", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "transfer" },
+    { "t",        "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "transfer" },
+    {     "bt709",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709},         0, 0, FLAGS, "transfer" },
+    {     "bt2020",           0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10},     0, 0, FLAGS, "transfer" },
+    { "matrix", "set colorspace matrix", OFFSET(colorspace), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "matrix" },
+    { "m",      "set colorspace matrix", OFFSET(colorspace), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "matrix" },
+    {     "bt709",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709},         0, 0, FLAGS, "matrix" },
+    {     "bt2020",           0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_NCL},    0, 0, FLAGS, "matrix" },
+    { "primaries", "set color primaries", OFFSET(primaries), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "primaries" },
+    { "p",         "set color primaries", OFFSET(primaries), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "primaries" },
+    {     "bt709",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709},         0, 0, FLAGS, "primaries" },
+    {     "bt2020",           0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020},        0, 0, FLAGS, "primaries" },
+    { "range",         "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "range" },
+    { "r",             "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "range" },
+    {     "tv",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG},         0, 0, FLAGS, "range" },
+    {     "pc",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG},         0, 0, FLAGS, "range" },
+    {     "limited",       0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG},         0, 0, FLAGS, "range" },
+    {     "full",          0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG},         0, 0, FLAGS, "range" },
+    { "format",    "output pixel format", OFFSET(format), AV_OPT_TYPE_INT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_GBRAP12LE, FLAGS, "fmt" },
+    {     "nv12",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AV_PIX_FMT_NV12},          0, 0, FLAGS, "fmt" },
+    {     "p010",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AV_PIX_FMT_P010},          0, 0, FLAGS, "fmt" },
+    { "peak",      "signal peak override", OFFSET(peak), AV_OPT_TYPE_DOUBLE, {.dbl = 0}, 0, DBL_MAX, FLAGS },
+    { "param",     "tonemap parameter",   OFFSET(param), AV_OPT_TYPE_DOUBLE, {.dbl = NAN}, DBL_MIN, DBL_MAX, FLAGS },
+    { NULL }
+};
+
+AVFILTER_DEFINE_CLASS(tonemap_opencl);
+
+static const AVFilterPad tonemap_opencl_inputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .filter_frame = &tonemap_opencl_filter_frame,
+        .config_props = &ff_opencl_filter_config_input,
+    },
+    { NULL }
+};
+
+static const AVFilterPad tonemap_opencl_outputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = &tonemap_opencl_config_output,
+    },
+    { NULL }
+};
+
+AVFilter ff_vf_tonemap_opencl = {
+    .name           = "tonemap_opencl",
+    .description    = NULL_IF_CONFIG_SMALL("Perform HDR to SDR conversion with tonemapping."),
+    .priv_size      = sizeof(TonemapOpenCLContext),
+    .priv_class     = &tonemap_opencl_class,
+    .init           = &ff_opencl_filter_init,
+    .uninit         = &tonemap_opencl_uninit,
+    .query_formats  = &ff_opencl_filter_query_formats,
+    .inputs         = tonemap_opencl_inputs,
+    .outputs        = tonemap_opencl_outputs,
+    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};