[FFmpeg-devel,RFC] lavfi: add scale_opencl filter.

Submitted by Ruiling Song on Jan. 30, 2019, 8:13 a.m.

Details

Message ID 1548835982-8118-1-git-send-email-ruiling.song@intel.com
State New
Headers show

Commit Message

Ruiling Song Jan. 30, 2019, 8:13 a.m.
Signed-off-by: Ruiling Song <ruiling.song@intel.com>
---
This patch depends on the colorspace patchset I sent before
(https://patchwork.ffmpeg.org/patch/11820/)
Although I am still working on some minor functionality,
hope somebody could give some comments about the overall design.

Ruiling

 configure                     |   1 +
 libavfilter/Makefile          |   2 +
 libavfilter/allfilters.c      |   1 +
 libavfilter/opencl/scale.cl   | 252 ++++++++++++++++
 libavfilter/opencl_source.h   |   1 +
 libavfilter/vf_scale_opencl.c | 682 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 939 insertions(+)
 create mode 100644 libavfilter/opencl/scale.cl
 create mode 100644 libavfilter/vf_scale_opencl.c

Patch hide | download patch | download mbox

diff --git a/configure b/configure
index ec8f70d..5640137 100755
--- a/configure
+++ b/configure
@@ -3450,6 +3450,7 @@  rubberband_filter_deps="librubberband"
 sab_filter_deps="gpl swscale"
 scale2ref_filter_deps="swscale"
 scale_filter_deps="swscale"
+scale_opencl_filter_deps="opencl"
 scale_qsv_filter_deps="libmfx"
 select_filter_select="scene_sad"
 sharpness_vaapi_filter_deps="vaapi"
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index bc642ac..9de7d44 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -343,6 +343,8 @@  OBJS-$(CONFIG_SCALE_FILTER)                  += vf_scale.o scale.o
 OBJS-$(CONFIG_SCALE_CUDA_FILTER)             += vf_scale_cuda.o vf_scale_cuda.ptx.o \
                                                 cuda_check.o
 OBJS-$(CONFIG_SCALE_NPP_FILTER)              += vf_scale_npp.o scale.o cuda_check.o
+OBJS-$(CONFIG_SCALE_OPENCL_FILTER)           += vf_scale_opencl.o opencl.o \
+                                                opencl/scale.o
 OBJS-$(CONFIG_SCALE_QSV_FILTER)              += vf_scale_qsv.o
 OBJS-$(CONFIG_SCALE_VAAPI_FILTER)            += vf_scale_vaapi.o scale.o vaapi_vpp.o
 OBJS-$(CONFIG_SCALE2REF_FILTER)              += vf_scale.o scale.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index c51ae0f..5708d16 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -325,6 +325,7 @@  extern AVFilter ff_vf_sab;
 extern AVFilter ff_vf_scale;
 extern AVFilter ff_vf_scale_cuda;
 extern AVFilter ff_vf_scale_npp;
+extern AVFilter ff_vf_scale_opencl;
 extern AVFilter ff_vf_scale_qsv;
 extern AVFilter ff_vf_scale_vaapi;
 extern AVFilter ff_vf_scale2ref;
diff --git a/libavfilter/opencl/scale.cl b/libavfilter/opencl/scale.cl
new file mode 100644
index 0000000..5d3deda
--- /dev/null
+++ b/libavfilter/opencl/scale.cl
@@ -0,0 +1,252 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+extern float3 yuv2rgb(float, float, float);
+extern float3 rgb2yuv(float, float, float);
+
+const sampler_t sampler_nearest = (CLK_NORMALIZED_COORDS_FALSE |
+                                   CLK_ADDRESS_CLAMP |
+                                   CLK_FILTER_NEAREST);
+
+const sampler_t sampler_linear = (CLK_NORMALIZED_COORDS_FALSE |
+                                  CLK_ADDRESS_CLAMP |
+                                  CLK_FILTER_LINEAR);
+
+float4 neighbor(image2d_t img, float vscale,
+                float hscale, int x, int y,
+                __constant float *coff_x,
+                __constant float *coff_y,
+                int2 filter_size)
+{
+    float xi = ((float)x + 0.5f) * hscale;
+    float yi = ((float)y + 0.5f) * vscale;
+
+    return read_imagef(img, sampler_nearest, (float2)(xi, yi));
+}
+
+float4 bilinear(image2d_t img, float vscale,
+                float hscale, int x, int y,
+                __constant float *coff_x,
+                __constant float *coff_y,
+                int2 filter_size)
+{
+    float xi = ((float)x + 0.5f) * hscale;
+    float yi = ((float)y + 0.5f) * vscale;
+
+    return read_imagef(img, sampler_linear, (float2)(xi, yi));
+}
+
+float4 generic_filter(image2d_t img, float vscale, float hscale, int x, int y,
+                      __constant float *coff_x, __constant float *coff_y,
+                      int2 filter_size)
+{
+    int2 dst_pos = (int2)(x, y);
+    float2 src_coord = (convert_float2(dst_pos) + 0.5f) *
+                       (float2)(hscale, vscale);
+    int2 src_pos = convert_int2(floor(src_coord - 0.5f));
+
+    float4 color = 0.0f;
+    for (int i = 0; i < filter_size.y; ++i) {
+        float4 sum = 0.0f;
+        for (int j = 0; j < filter_size.x; ++j) {
+            int x_offset = filter_size.x / 2 - j;
+            int y_offset = filter_size.y / 2 - i;
+            float4 c = read_imagef(img, sampler_nearest,
+                                   src_pos + (int2)(x_offset, y_offset));
+            sum += c * coff_x[dst_pos.x * filter_size.x + j];
+        }
+        color += sum * coff_y[dst_pos.y * filter_size.y + i];
+    }
+    return color;
+}
+
+__kernel void scale(__write_only image2d_t dst,
+                    __read_only  image2d_t src,
+                    int dst_width, int dst_height,
+                    int src_width, int src_height,
+                    __constant float *coff_x,
+                    __constant float *coff_y,
+                    int2 filter_size)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    float vscale = (float)src_height / (float)dst_height;
+    float hscale = (float)src_width / (float)dst_width;
+
+    float4 sum = algorithm(src, vscale, hscale, x, y,
+                           coff_x, coff_y, filter_size);
+
+    if (x < dst_width && y < dst_height)
+        write_imagef(dst, (int2)(x, y), sum);
+}
+
+// read chroma value from 'img', 'color[4]' contains the results.
+inline void read_chroma(float4 color[4], image2d_t img,
+                        float scalev, float scaleh, int xchr, int ychr,
+                        __constant float *coff_x, __constant float *coff_y,
+                        int2 filter_size, int chroma_h, int chroma_v)
+{
+    color[0] = algorithm(img, scalev, scaleh, xchr, ychr, coff_x, coff_y,
+                         filter_size);
+
+    if (chroma_v == 2) {
+        color[2] = algorithm(img, scalev, scaleh, xchr, ychr + 1, coff_x, coff_y,
+                             filter_size);
+        if (chroma_h == 2) {
+            color[1] = algorithm(img, scalev, scaleh, xchr + 1, ychr, coff_x,
+                                 coff_y, filter_size);
+            color[3] = algorithm(img, scalev, scaleh, xchr + 1, ychr + 1, coff_x,
+                                 coff_y, filter_size);
+        } else {
+            color[1] = color[0];
+            color[3] = color[2];
+        }
+    } else {
+        color[1] = color[2] = color[3] = color[0];
+    }
+}
+
+inline void write_chroma(write_only image2d_t img, float4 c[4],
+                         int xchr, int ychr, int chroma_h, int chroma_v)
+{
+    write_imagef(img, (int2)(xchr, ychr), c[0]);
+
+    if (chroma_v == 2) {
+        write_imagef(img, (int2)(xchr, ychr + 1), c[2]);
+        if (chroma_h == 2) {
+            write_imagef(img, (int2)(xchr + 1, ychr), c[1]);
+            write_imagef(img, (int2)(xchr + 1, ychr + 1), c[3]);
+        }
+    }
+}
+
+__kernel void convert(__write_only image2d_t dst0,
+                      __write_only image2d_t dst1,
+                      __write_only image2d_t dst2,
+                      __write_only image2d_t dst3,
+                      __read_only  image2d_t src0,
+                      __read_only  image2d_t src1,
+                      __read_only  image2d_t src2,
+                      __read_only  image2d_t src3,
+                      int dst_width, int dst_height,
+                      int src_width, int src_height,
+                      __constant float *coff_x,
+                      __constant float *coff_y,
+                      int2 filter_size)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    int x2 = x * 2;
+    int y2 = y * 2;
+    int xchr = SRC_CHROMA_H * x;
+    int ychr = SRC_CHROMA_V * y;
+    int i;
+    float4 color[4], color0[4], color1[4], color2[4], color3[4];
+
+    float scalev = (float)src_height / (float)dst_height;
+    float scaleh = (float)src_width / (float)dst_width;
+    color0[0] = algorithm(src0, scalev, scaleh, x2,     y2,
+                          coff_x, coff_y, filter_size);
+    color0[1] = algorithm(src0, scalev, scaleh, x2 + 1, y2,
+                          coff_x, coff_y, filter_size);
+    color0[2] = algorithm(src0, scalev, scaleh, x2,     y2 + 1,
+                          coff_x, coff_y, filter_size);
+    color0[3] = algorithm(src0, scalev, scaleh, x2 + 1, y2 + 1,
+                          coff_x, coff_y, filter_size);
+    #pragma unroll
+    for (i = 0; i < 4; i++)
+        color[i] = color0[i];
+
+    if (SRC_IMGS > 1) {
+        read_chroma(color1, src1, scalev, scaleh, xchr, ychr,
+                    coff_x, coff_y, filter_size,
+                    SRC_CHROMA_H, SRC_CHROMA_V);
+        #pragma unroll
+        for (i = 0; i < 4; i++)
+            color[i].yz = color1[i].xy;
+    }
+
+    if (SRC_IMGS > 2) {
+        read_chroma(color2, src2, scalev, scaleh, xchr, ychr, 
+                    coff_x, coff_y, filter_size,
+                    SRC_CHROMA_H, SRC_CHROMA_V);
+        #pragma unroll
+        for (i = 0; i < 4; i++)
+            color[i].z = color2[i].x;
+    }
+
+    if (SRC_IMGS > 3) {
+        color3[0] = algorithm(src3, scalev, scaleh, x2,     y2,
+                              coff_x, coff_y, filter_size);
+        color3[1] = algorithm(src3, scalev, scaleh, x2 + 1, y2,
+                              coff_x, coff_y, filter_size);
+        color3[2] = algorithm(src3, scalev, scaleh, x2,     y2 + 1,
+                              coff_x, coff_y, filter_size);
+        color3[3] = algorithm(src3, scalev, scaleh, x2 + 1, y2 + 1,
+                              coff_x, coff_y, filter_size);
+        #pragma unroll
+        for (i = 0; i < 4; i++)
+            color[i].w = color3[i].x;
+    }
+
+    // possible yuv-rgb conversion here
+    #ifdef YUV2RGB
+    #pragma unroll
+    for (i = 0; i < 4; i++)
+        color[i].xyz = yuv2rgb(color[i].x, color[i].y, color[i].z);
+    #endif
+
+    #ifdef RGB2YUV
+    #pragma unroll
+    for (i = 0; i < 4; i++)
+        color[i].xyz = rgb2yuv(color[i].x, color[i].y, color[i].z);
+
+    #endif
+
+    xchr = DST_CHROMA_H * x;
+    ychr = DST_CHROMA_V * y;
+
+    if (x2 < dst_width && y2 < dst_height) {
+        write_imagef(dst0, (int2)(x2,     y2    ), color[0]);
+        write_imagef(dst0, (int2)(x2 + 1, y2    ), color[1]);
+        write_imagef(dst0, (int2)(x2,     y2 + 1), color[2]);
+        write_imagef(dst0, (int2)(x2 + 1, y2 + 1), color[3]);
+    }
+
+    if (DST_IMGS > 1 && x2 < dst_width && y2 < dst_height) {
+        float4 c2[4];
+        #pragma unroll
+        for (i = 0; i < 4; i++)
+            c2[i] = color[i].yzyz;
+        write_chroma(dst1, c2, xchr, ychr, DST_CHROMA_H, DST_CHROMA_V);
+
+        if (DST_IMGS > 2) {
+            #pragma unroll
+            for (i = 0; i < 4; i++)
+                c2[i] = color[i].zzzz;
+            write_chroma(dst2, c2, xchr, ychr, DST_CHROMA_H, DST_CHROMA_V);
+        }
+
+        if (DST_IMGS > 3) {
+            write_imagef(dst3, (int2)(x2, y2), color[0].wwww);
+            write_imagef(dst3, (int2)(x2 + 1, y2), color[1].wwww);
+            write_imagef(dst3, (int2)(x2, y2 + 1), color[2].wwww);
+            write_imagef(dst3, (int2)(x2 + 1, y2 + 1), color[3].wwww);
+        }
+    }
+}
diff --git a/libavfilter/opencl_source.h b/libavfilter/opencl_source.h
index 4118138..3dd7634 100644
--- a/libavfilter/opencl_source.h
+++ b/libavfilter/opencl_source.h
@@ -24,6 +24,7 @@  extern const char *ff_opencl_source_colorspace_common;
 extern const char *ff_opencl_source_convolution;
 extern const char *ff_opencl_source_neighbor;
 extern const char *ff_opencl_source_overlay;
+extern const char *ff_opencl_source_scale;
 extern const char *ff_opencl_source_tonemap;
 extern const char *ff_opencl_source_transpose;
 extern const char *ff_opencl_source_unsharp;
diff --git a/libavfilter/vf_scale_opencl.c b/libavfilter/vf_scale_opencl.c
new file mode 100644
index 0000000..929a5dd
--- /dev/null
+++ b/libavfilter/vf_scale_opencl.c
@@ -0,0 +1,682 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/bprint.h"
+#include "libavutil/common.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/mem.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "avfilter.h"
+#include "colorspace.h"
+#include "internal.h"
+#include "opencl.h"
+#include "opencl_source.h"
+#include "scale.h"
+#include "video.h"
+
+enum ScaleAlgorithm {
+  SCALE_NEIGHBOR = 0,
+  SCALE_BILINEAR,
+  SCALE_BICUBIC,
+};
+
+typedef struct SurfaceInfo {
+    int plane_id[4];
+    int nb_planes;
+    int is_rgb;
+    int width;
+    int height;
+    int chroma_v; /// value of (chroma_height / half_luma_height)
+    int chroma_h; /// value of (chroma_width / half_luma_width)
+} SurfaceInfo;
+
+typedef struct ScaleInfo {
+    SurfaceInfo src;
+    SurfaceInfo dst;
+} ScaleInfo;
+
+typedef struct ScaleOpenCLContext {
+    OpenCLFilterContext ocf;
+    enum AVPixelFormat format;
+    enum AVColorSpace colorspace, colorspace_in, colorspace_out;
+    enum AVColorRange range, range_in, range_out;
+    enum AVChromaLocation chroma_loc;
+    enum ScaleAlgorithm algorithm;
+
+    char            *w_expr;
+    char            *h_expr;
+    char            *format_str;
+    int              initialised;
+    cl_kernel        kernel;
+    cl_command_queue command_queue;
+    cl_mem           coff_x;
+    cl_mem           coff_y;
+    cl_int2          filter_size;
+    ScaleInfo        job;
+} ScaleOpenCLContext;
+
+static const char *algo_str[3] =
+{
+    "neighbor",
+    "bilinear",
+    "generic_filter"
+};
+
+static const enum AVPixelFormat supported_formats[] = {
+    AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_YUV422P,
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NV12,
+    AV_PIX_FMT_GBRP,
+    AV_PIX_FMT_RGBA,
+    AV_PIX_FMT_BGRA,
+    AV_PIX_FMT_ARGB
+};
+
+static void gather_surface_info(AVFilterContext *avctx, SurfaceInfo *info,
+                                AVFrame *frame)
+{
+    const AVPixFmtDescriptor *desc;
+    AVHWFramesContext *hwctx;
+    enum AVPixelFormat format;
+    int plane_id, plane_id_last;
+    int i, nb_comp;
+
+    hwctx = (AVHWFramesContext *)frame->hw_frames_ctx->data;
+    format = hwctx->sw_format;
+    desc = av_pix_fmt_desc_get(format);
+    info->is_rgb = desc->flags & AV_PIX_FMT_FLAG_RGB;
+
+    nb_comp = desc->nb_components;
+    plane_id_last = -1;
+
+    info->width = frame->width;
+    info->height = frame->height;
+    info->chroma_h = 2 / (1 << desc->log2_chroma_w);
+    info->chroma_v = 2 / (1 << desc->log2_chroma_h);
+    for (i = 0; i < nb_comp; i++) {
+        plane_id = desc->comp[i].plane;
+
+        if (plane_id != plane_id_last) {
+            info->plane_id[info->nb_planes] = plane_id;
+            info->nb_planes++;
+        }
+        plane_id_last = plane_id;
+    }
+}
+
+static int init_jobs(AVFilterContext *avctx, AVFrame *in, AVFrame *out)
+{
+    ScaleOpenCLContext *ctx = avctx->priv;
+    ScaleInfo *info = &ctx->job;
+
+    gather_surface_info(avctx, &info->src, in);
+    gather_surface_info(avctx, &info->dst, out);
+
+    av_log(avctx, AV_LOG_DEBUG, "Scale Job Info:\n");
+    av_log(avctx, AV_LOG_DEBUG, "src: planes: %d width: %d height: %d, "
+           "plane-id (%d %d %d %d)\n", info->src.nb_planes,
+           info->src.width, info->src.height,
+           info->src.plane_id[0], info->src.plane_id[1],
+           info->src.plane_id[2], info->src.plane_id[3]);
+
+    av_log(avctx, AV_LOG_DEBUG, "dst: planes: %d width: %d height: %d, "
+           "plane-id (%d %d %d %d)\n", info->dst.nb_planes,
+           info->dst.width, info->dst.height,
+           info->dst.plane_id[0], info->dst.plane_id[1],
+           info->dst.plane_id[2], info->dst.plane_id[3]);
+
+    return 0;
+}
+
+static const int filter_size[] = {
+  [SCALE_NEIGHBOR] = 1,
+  [SCALE_BILINEAR] = 1,
+  [SCALE_BICUBIC] = 4,
+};
+
+static float catmullrom(float x)
+{
+    float x2, x3;
+    const float B = 0.0f;
+    const float C = 0.5f;
+    x = x < 0.0f ? -x : x;
+    x2 = x * x;
+    x3 = x2 * x;
+
+    if(x < 1.0f) {
+        return ((12 - 9 * B - 6 * C) * x3 +
+                (-18 + 12 * B + 6 * C) * x2 +
+                (6 - 2 * B)) / 6.0f;
+    } else if (x >= 1.0 && x < 2.0f) {
+        return (( -B - 6 * C) * x3 +
+                ( 6 * B + 30 * C ) * x2 +
+                (-( 12 * B ) - 48 * C) * x +
+                8 * B + 24 * C) / 6.0f;
+    } else {
+        return 0.0f;
+    }
+}
+
+static float filter(enum ScaleAlgorithm algo, float f)
+{
+    switch (algo) {
+    case SCALE_BICUBIC:
+        return catmullrom(f);
+    default:
+        return f;
+    }
+}
+
+static void fill_filter_table(AVFilterContext *avctx, float *buf, int length,
+                              int filter_size, float scale)
+{
+    ScaleOpenCLContext *ctx = avctx->priv;
+    int i, k;
+    float sum;
+    for (i = 0; i < length; i++) {
+        float dst_in_src = (i + 0.5) * scale - 0.5;
+        float t = dst_in_src - floor(dst_in_src);
+        sum = 0.0;
+        for (k = 0; k < filter_size; k++) {
+            float fpos = ((float)(filter_size / 2 - k) - t);
+            float ff = filter(ctx->algorithm, fpos / FFMIN(scale, 1.0));
+            buf[i * filter_size + k] = ff;
+            sum += ff;
+        }
+
+        for (k = 0; k < filter_size; k++) {
+            buf[i * filter_size + k] /= sum;
+        }
+    }
+}
+static int prepare_opencl_header(AVFilterContext *avctx, AVBPrint *header)
+{
+    ScaleOpenCLContext *ctx = avctx->priv;
+    double rgb2yuv[3][3], yuv2rgb[3][3];
+    const struct LumaCoefficients *luma_src;
+
+    luma_src = ff_get_luma_coefficients(ctx->colorspace_in);
+    if (!luma_src) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported input colorspace %d\n",
+               ctx->colorspace_in);
+        return AVERROR(EINVAL);
+    }
+    // fill this value as it is required to compile opencl program successfully
+    // currently we only support source & destination with same color-space.
+    av_bprintf(header, "constant float3 luma_src = {%.4ff, %.4ff, %.4ff};\n",
+               luma_src->cr, luma_src->cg, luma_src->cb);
+    av_bprintf(header, "constant float3 luma_dst = {%.4ff, %.4ff, %.4ff};\n",
+               luma_src->cr, luma_src->cg, luma_src->cb);
+    av_bprintf(header, "#define RGB2RGB_PASSTHROUGH\n");
+
+    ff_fill_rgb2yuv_table(luma_src, rgb2yuv);
+    ff_opencl_print_const_matrix_3x3(header, "yuv_matrix", rgb2yuv);
+    ff_matrix_invert_3x3(rgb2yuv, yuv2rgb);
+    ff_opencl_print_const_matrix_3x3(header, "rgb_matrix", yuv2rgb);
+
+    if (ctx->job.src.is_rgb && !ctx->job.dst.is_rgb) {
+        av_bprintf(header, "#define RGB2YUV\n");
+    } else if (!ctx->job.src.is_rgb && ctx->job.dst.is_rgb) {
+        av_bprintf(header, "#define YUV2RGB\n");
+    }
+
+    if (!ctx->job.src.is_rgb && ctx->range_in == AVCOL_RANGE_JPEG)
+        av_bprintf(header, "#define FULL_RANGE_IN\n");
+
+    if (!ctx->job.dst.is_rgb && ctx->range_out == AVCOL_RANGE_JPEG)
+        av_bprintf(header, "#define FULL_RANGE_OUT\n");
+
+    av_bprintf(header, "#define chroma_loc %d\n", (int)ctx->chroma_loc);
+    av_bprintf(header, "#define algorithm %s\n", algo_str[ctx->algorithm]);
+    av_bprintf(header, "#define DST_IMGS %d\n", ctx->job.dst.nb_planes);
+    av_bprintf(header, "#define SRC_IMGS %d\n", ctx->job.src.nb_planes);
+    av_bprintf(header, "#define SRC_CHROMA_V %d\n", ctx->job.src.chroma_v);
+    av_bprintf(header, "#define SRC_CHROMA_H %d\n", ctx->job.src.chroma_h);
+    av_bprintf(header, "#define DST_CHROMA_V %d\n", ctx->job.dst.chroma_v);
+    av_bprintf(header, "#define DST_CHROMA_H %d\n", ctx->job.dst.chroma_h);
+    return 0;
+}
+#define OPENCL_SOURCE_NB 3
+static int scale_opencl_init(AVFilterContext *avctx)
+{
+    ScaleOpenCLContext *ctx = avctx->priv;
+    const char *opencl_sources[OPENCL_SOURCE_NB];
+    float *coff_x = NULL, *coff_y = NULL;
+    cl_int cle;
+    int err;
+    int filter_size_x, filter_size_y, size_factor;
+    float vscale, hscale;
+    const char *k_name;
+    AVBPrint header;
+
+    size_factor = filter_size[ctx->algorithm];
+    vscale = ctx->job.src.height / ctx->job.dst.height;
+    hscale = ctx->job.src.width / ctx->job.dst.width;
+
+    filter_size_x = ceil(size_factor * FFMAX(1, hscale));
+    filter_size_y = ceil(size_factor * FFMAX(1, vscale));
+    ctx->filter_size.s[0] = filter_size_x;
+    ctx->filter_size.s[1] = filter_size_y;
+
+    if (ctx->algorithm != SCALE_BILINEAR &&
+        ctx->algorithm != SCALE_NEIGHBOR) {
+        // we don't need such coefficients for bilinear & neighbor
+        coff_x = av_malloc_array(filter_size_x * ctx->job.dst.width,
+                                 sizeof(float));
+        coff_y = av_malloc_array(filter_size_y * ctx->job.dst.height,
+                                 sizeof(float));
+        if (!coff_x || !coff_y) {
+            goto fail;
+        }
+
+        fill_filter_table(avctx, coff_x, ctx->job.dst.width, filter_size_x,
+                          hscale);
+        fill_filter_table(avctx, coff_y, ctx->job.dst.height, filter_size_y,
+                          vscale);
+
+        ctx->coff_x = clCreateBuffer(ctx->ocf.hwctx->context,
+                                     CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY,
+                                     ctx->job.dst.width * filter_size_x *
+                                        sizeof(cl_float),
+                                     coff_x, &cle);
+
+        ctx->coff_y = clCreateBuffer(ctx->ocf.hwctx->context,
+                                     CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY,
+                                     ctx->job.dst.height * filter_size_y *
+                                        sizeof(cl_float),
+                                     coff_y, &cle);
+        av_freep(&coff_x);
+        av_freep(&coff_y);
+    }
+
+    av_bprint_init(&header, 1024, AV_BPRINT_SIZE_AUTOMATIC);
+    err = prepare_opencl_header(avctx, &header);
+    if (err < 0)
+        goto fail;
+    av_log(avctx, AV_LOG_DEBUG, "Generated OpenCL header:\n%s\n", header.str);
+    opencl_sources[0] = header.str;
+    opencl_sources[1] = ff_opencl_source_scale;
+    opencl_sources[2] = ff_opencl_source_colorspace_common;
+    err = ff_opencl_filter_load_program(avctx, opencl_sources, OPENCL_SOURCE_NB);
+
+    av_bprint_finalize(&header, NULL);
+
+    if (err < 0)
+        goto fail;
+
+    ctx->command_queue = clCreateCommandQueue(ctx->ocf.hwctx->context,
+                                              ctx->ocf.hwctx->device_id,
+                                              0, &cle);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create OpenCL "
+                     "command queue %d.\n", cle);
+
+    k_name = ctx->format != AV_PIX_FMT_NONE ? "convert" : "scale";
+    ctx->kernel = clCreateKernel(ctx->ocf.program, k_name, &cle);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create kernel %d.\n", cle);
+
+    ctx->initialised = 1;
+    return 0;
+
+fail:
+    if (ctx->command_queue)
+        clReleaseCommandQueue(ctx->command_queue);
+    if (ctx->kernel)
+        clReleaseKernel(ctx->kernel);
+
+    if (coff_x)
+        av_freep(&coff_x);
+    if (coff_y)
+        av_freep(&coff_y);
+
+    av_bprint_finalize(&header, NULL);
+    return err;
+}
+
+static int is_fmt_supported(enum AVPixelFormat fmt)
+{
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
+        if (supported_formats[i] == fmt)
+            return 1;
+
+    return 0;
+}
+
+static int scale_opencl_config_output(AVFilterLink *outlink)
+{
+    AVFilterContext *avctx = outlink->src;
+    ScaleOpenCLContext *s = avctx->priv;
+    AVFilterLink *inlink = outlink->src->inputs[0];
+    AVHWFramesContext *in_frames_ctx;
+    enum AVPixelFormat in_format;
+    int w, h, ret;
+
+    if (!strcmp(s->format_str, "same")) {
+        s->format = AV_PIX_FMT_NONE;
+    } else {
+        s->format = av_get_pix_fmt(s->format_str);
+        if (s->format == AV_PIX_FMT_NONE) {
+            av_log(avctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str);
+            return AVERROR(EINVAL);
+        }
+    }
+
+    if ((ret = ff_scale_eval_dimensions(s,
+                                        s->w_expr, s->h_expr,
+                                        inlink, outlink,
+                                        &w, &h)) < 0)
+        return ret;
+
+    s->ocf.output_width = w;
+    s->ocf.output_height = h;
+    if (s->format != AV_PIX_FMT_NONE)
+        s->ocf.output_format = s->format;
+
+    ret = ff_opencl_filter_config_output(outlink);
+    if (ret < 0)
+        return ret;
+
+    in_frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
+    in_format  = in_frames_ctx->sw_format;
+
+    if (!is_fmt_supported(in_format)) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported input format %s\n",
+               av_get_pix_fmt_name(in_format));
+        return AVERROR(ENOSYS);
+    }
+
+    if (!is_fmt_supported(s->ocf.output_format)) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported output format %s\n",
+               av_get_pix_fmt_name(s->ocf.output_format));
+        return AVERROR(ENOSYS);
+    }
+
+    if (inlink->sample_aspect_ratio.num)
+        outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h*inlink->w,
+                                                             outlink->w*inlink->h},
+                                                inlink->sample_aspect_ratio);
+    else
+        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
+    return 0;
+}
+
+static int enqueue_simple_scale(AVFilterContext *avctx, cl_command_queue queue,
+                                cl_kernel kernel, AVFrame *input,
+                                AVFrame *output)
+{
+    ScaleOpenCLContext *ctx = avctx->priv;
+    size_t global_work[2];
+    cl_int cle;
+    cl_mem src, dst;
+    cl_int src_width, src_height, dst_width, dst_height;
+    int err, plane, is_chroma;
+
+    for (plane = 0; plane < FF_ARRAY_ELEMS(output->data); plane++) {
+        dst = (cl_mem)output->data[plane];
+        src = (cl_mem)input->data[plane];
+
+        if (!dst) break;
+
+        is_chroma = (plane == 1 || plane == 2);
+        if (!is_chroma) {
+            src_width = input->width;
+            src_height = input->height;
+            dst_width = output->width;
+            dst_height = output->height;
+        } else {
+            src_width = input->width * ctx->job.src.chroma_h / 2;
+            src_height = input->height * ctx->job.src.chroma_v / 2;
+            dst_width = output->width * ctx->job.dst.chroma_h / 2;
+            dst_height = output->height * ctx->job.dst.chroma_v / 2;
+        }
+
+        CL_SET_KERNEL_ARG(kernel, 0, cl_mem, &dst);
+        CL_SET_KERNEL_ARG(kernel, 1, cl_mem, &src);
+        CL_SET_KERNEL_ARG(kernel, 2, cl_int, &dst_width);
+        CL_SET_KERNEL_ARG(kernel, 3, cl_int, &dst_height);
+        CL_SET_KERNEL_ARG(kernel, 4, cl_int, &src_width);
+        CL_SET_KERNEL_ARG(kernel, 5, cl_int, &src_height);
+        CL_SET_KERNEL_ARG(kernel, 6, cl_mem, &ctx->coff_x);
+        CL_SET_KERNEL_ARG(kernel, 7, cl_mem, &ctx->coff_y);
+        CL_SET_KERNEL_ARG(kernel, 8, cl_int2, &ctx->filter_size);
+
+        err = ff_opencl_filter_work_size_from_image(avctx, global_work, output,
+                                                    plane, 8);
+        if (err < 0)
+            return err;
+
+        cle = clEnqueueNDRangeKernel(ctx->command_queue, kernel, 2, NULL,
+                                     global_work, NULL,
+                                     0, NULL, NULL);
+        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue kernel: %d.\n", cle);
+
+    }
+    return 0;
+fail:
+    return err;
+}
+
+static int enqueue_convert(AVFilterContext *avctx, cl_command_queue queue,
+                           cl_kernel kernel, AVFrame *in, AVFrame *out)
+{
+    ScaleOpenCLContext *ctx = avctx->priv;
+    cl_int cle;
+    cl_mem mem;
+    int err;
+    size_t global_work[2];
+
+    global_work[0] = ctx->job.dst.width / 2;
+    global_work[1] = ctx->job.dst.height / 2;
+#define SET_DST(idx) \
+mem = (cl_mem)out->data[ctx->job.dst.plane_id[idx]]; \
+CL_SET_KERNEL_ARG(kernel, idx, cl_mem, &mem);
+
+#define SET_SRC(idx) \
+mem = (cl_mem)in->data[ctx->job.src.plane_id[idx]]; \
+CL_SET_KERNEL_ARG(kernel, idx + 4, cl_mem, &mem);
+
+    SET_DST(0)
+    SET_DST(1)
+    SET_DST(2)
+    SET_DST(3)
+
+    SET_SRC(0)
+    SET_SRC(1)
+    SET_SRC(2)
+    SET_SRC(3)
+#undef SET_DST
+#undef SET_SRC
+    CL_SET_KERNEL_ARG(kernel, 8, cl_int, &ctx->job.dst.width);
+    CL_SET_KERNEL_ARG(kernel, 9, cl_int, &ctx->job.dst.height);
+    CL_SET_KERNEL_ARG(kernel, 10, cl_int, &ctx->job.src.width);
+    CL_SET_KERNEL_ARG(kernel, 11, cl_int, &ctx->job.src.height);
+    CL_SET_KERNEL_ARG(kernel, 12, cl_mem, &ctx->coff_x);
+    CL_SET_KERNEL_ARG(kernel, 13, cl_mem, &ctx->coff_y);
+    CL_SET_KERNEL_ARG(kernel, 14, cl_int2, &ctx->filter_size);
+
+    cle = clEnqueueNDRangeKernel(queue, kernel, 2, NULL,
+                                 global_work, NULL,
+                                 0, NULL, NULL);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue kernel: %d.\n", cle);
+
+    return 0;
+fail:
+    return err;
+}
+
+static int scale_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
+{
+    AVFilterContext    *avctx = inlink->dst;
+    AVFilterLink     *outlink = avctx->outputs[0];
+    ScaleOpenCLContext *ctx = avctx->priv;
+    AVFrame *output = NULL;
+    cl_kernel kernel;
+    cl_int cle;
+    int err;
+
+    av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",
+           av_get_pix_fmt_name(input->format),
+           input->width, input->height, input->pts);
+
+    if (!input->hw_frames_ctx)
+        return AVERROR(EINVAL);
+
+
+    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+    if (!output) {
+        err = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    err = av_frame_copy_props(output, input);
+    if (err < 0)
+        goto fail;
+
+    if (ctx->colorspace != -1)
+        ctx->colorspace_in = ctx->colorspace;
+    else
+        ctx->colorspace_in = input->colorspace;
+    // default to bt601
+    if (ctx->colorspace_in == AVCOL_SPC_UNSPECIFIED)
+        ctx->colorspace_in = AVCOL_SPC_BT470BG;
+    ctx->range_in = input->color_range;
+
+    if (!ctx->initialised) {
+        err = init_jobs(avctx, input, output);
+        if (err < 0)
+            goto fail;
+
+        err = scale_opencl_init(avctx);
+        if (err < 0)
+            goto fail;
+    }
+
+    kernel = ctx->kernel;
+
+    if (ctx->format != AV_PIX_FMT_NONE)
+        err = enqueue_convert(avctx, ctx->command_queue, kernel, input, output);
+    else
+        err = enqueue_simple_scale(avctx, ctx->command_queue, kernel, input,
+                                   output);
+    if (err < 0)
+        goto fail;
+
+    cle = clFinish(ctx->command_queue);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to finish command queue: %d.\n", cle);
+
+    av_frame_free(&input);
+
+    av_log(ctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n",
+           av_get_pix_fmt_name(output->format),
+           output->width, output->height, output->pts);
+
+    return ff_filter_frame(outlink, output);
+
+fail:
+    clFinish(ctx->command_queue);
+    av_frame_free(&input);
+    av_frame_free(&output);
+    return err;
+}
+
+static av_cold void scale_opencl_uninit(AVFilterContext *avctx)
+{
+    ScaleOpenCLContext *ctx = avctx->priv;
+    cl_int cle;
+
+    if (ctx->kernel) {
+        cle = clReleaseKernel(ctx->kernel);
+        if (cle != CL_SUCCESS)
+            av_log(avctx, AV_LOG_ERROR, "Failed to release "
+                   "kernel: %d.\n", cle);
+    }
+
+    if (ctx->command_queue) {
+        cle = clReleaseCommandQueue(ctx->command_queue);
+        if (cle != CL_SUCCESS)
+            av_log(avctx, AV_LOG_ERROR, "Failed to release "
+                   "command queue: %d.\n", cle);
+    }
+
+    if (ctx->coff_x) {
+        cle = clReleaseMemObject(ctx->coff_x);
+    }
+
+    if (ctx->coff_y) {
+        cle = clReleaseMemObject(ctx->coff_y);
+    }
+    ff_opencl_filter_uninit(avctx);
+}
+
+#define OFFSET(x) offsetof(ScaleOpenCLContext, x)
+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
+static const AVOption scale_opencl_options[] = {
+    { "w",      "Output video width",  OFFSET(w_expr),     AV_OPT_TYPE_STRING, { .str = "iw"   }, .flags = FLAGS },
+    { "h",      "Output video height", OFFSET(h_expr),     AV_OPT_TYPE_STRING, { .str = "ih"   }, .flags = FLAGS },
+    { "format", "Output pixel format", OFFSET(format_str), AV_OPT_TYPE_STRING, { .str = "same" }, .flags = FLAGS },
+    { "in_color_matrix", "set input YCbCr type",   OFFSET(colorspace),  AV_OPT_TYPE_INT, { .i64 = -1 }, -1, INT_MAX, FLAGS, "matrix" },
+    {     "bt709",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709},         0, 0, FLAGS, "matrix" },
+    {     "fcc",              0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_FCC},           0, 0, FLAGS, "matrix" },
+    {     "bt470bg",          0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT470BG},       0, 0, FLAGS, "matrix" },
+    {     "smpte170m",        0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_SMPTE170M},     0, 0, FLAGS, "matrix" },
+    {     "smpte240m",        0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_SMPTE240M},     0, 0, FLAGS, "matrix" },
+    {     "bt2020",           0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_NCL},    0, 0, FLAGS, "matrix" },
+    { "algorithm",     "Scaling algorithm",   OFFSET(algorithm), AV_OPT_TYPE_INT,    { .i64 = SCALE_BILINEAR }, INT_MIN, INT_MAX, FLAGS, "algorithm" },
+    {     "neighbor",  "nearest neighbor",    0,                 AV_OPT_TYPE_CONST,  { .i64 = SCALE_NEIGHBOR}, 0, 0, FLAGS, "algorithm" },
+    {     "bilinear",  "bilinear",            0,                 AV_OPT_TYPE_CONST,  { .i64 = SCALE_BILINEAR}, 0, 0, FLAGS, "algorithm" },
+    {     "bicubic",   "bicubic",             0,                 AV_OPT_TYPE_CONST,  { .i64 = SCALE_BICUBIC},  0, 0, FLAGS, "algorithm" },
+    { NULL }
+};
+
+AVFILTER_DEFINE_CLASS(scale_opencl);
+
+static const AVFilterPad scale_opencl_inputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .filter_frame = &scale_opencl_filter_frame,
+        .config_props = &ff_opencl_filter_config_input,
+    },
+    { NULL }
+};
+
+static const AVFilterPad scale_opencl_outputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = &scale_opencl_config_output,
+    },
+    { NULL }
+};
+
+AVFilter ff_vf_scale_opencl = {
+    .name           = "scale_opencl",
+    .description    = NULL_IF_CONFIG_SMALL("OpenCL Scale and Format Conversion Filter"),
+    .priv_size      = sizeof(ScaleOpenCLContext),
+    .priv_class     = &scale_opencl_class,
+    .init           = &ff_opencl_filter_init,
+    .uninit         = &scale_opencl_uninit,
+    .query_formats  = &ff_opencl_filter_query_formats,
+    .inputs         = scale_opencl_inputs,
+    .outputs        = scale_opencl_outputs,
+    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};