[FFmpeg-devel,12/14] lavfi: Add OpenCL overlay filter

Submitted by Mark Thompson on Sept. 10, 2017, 8:53 p.m.

Details

Message ID 20170910205338.29687-13-sw@jkqxz.net
State New
Headers show

Commit Message

Mark Thompson Sept. 10, 2017, 8:53 p.m.
Input and output formats must be the same, the overlay format must be
the same as the input except possibly with an additional alpha component.
---
 configure                       |   1 +
 libavfilter/Makefile            |   2 +
 libavfilter/allfilters.c        |   1 +
 libavfilter/opencl/overlay.cl   | 104 ++++++++++++
 libavfilter/opencl_source.h     |   2 +
 libavfilter/vf_overlay_opencl.c | 347 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 457 insertions(+)
 create mode 100644 libavfilter/opencl/overlay.cl
 create mode 100644 libavfilter/vf_overlay_opencl.c

Comments

Nicolas George Sept. 10, 2017, 9:10 p.m.
Le quartidi 24 fructidor, an CCXXV, Mark Thompson a écrit :
> Input and output formats must be the same, the overlay format must be
> the same as the input except possibly with an additional alpha component.
> ---
>  configure                       |   1 +
>  libavfilter/Makefile            |   2 +
>  libavfilter/allfilters.c        |   1 +
>  libavfilter/opencl/overlay.cl   | 104 ++++++++++++
>  libavfilter/opencl_source.h     |   2 +
>  libavfilter/vf_overlay_opencl.c | 347 ++++++++++++++++++++++++++++++++++++++++
>  6 files changed, 457 insertions(+)
>  create mode 100644 libavfilter/opencl/overlay.cl
>  create mode 100644 libavfilter/vf_overlay_opencl.c
> 
> diff --git a/configure b/configure
> index 895ae2ec38..c036a53a69 100755
> --- a/configure
> +++ b/configure
> @@ -3178,6 +3178,7 @@ negate_filter_deps="lut_filter"
>  nnedi_filter_deps="gpl"
>  ocr_filter_deps="libtesseract"
>  ocv_filter_deps="libopencv"
> +overlay_opencl_filter_deps="opencl"
>  owdenoise_filter_deps="gpl"
>  pan_filter_deps="swresample"
>  perspective_filter_deps="gpl"
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index cb3a1424d9..cc9d4021b8 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -249,6 +249,8 @@ OBJS-$(CONFIG_OCV_FILTER)                    += vf_libopencv.o
>  OBJS-$(CONFIG_OPENCL)                        += deshake_opencl.o unsharp_opencl.o
>  OBJS-$(CONFIG_OSCILLOSCOPE_FILTER)           += vf_datascope.o

>  OBJS-$(CONFIG_OVERLAY_FILTER)                += vf_overlay.o framesync2.o
> +OBJS-$(CONFIG_OVERLAY_OPENCL_FILTER)         += vf_overlay_opencl.o opencl.o \
> +                                                opencl/overlay.o

Missing framesync2.o?

>  OBJS-$(CONFIG_OWDENOISE_FILTER)              += vf_owdenoise.o
>  OBJS-$(CONFIG_PAD_FILTER)                    += vf_pad.o
>  OBJS-$(CONFIG_PALETTEGEN_FILTER)             += vf_palettegen.o
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index 154ddf706d..261dd3a8e1 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -260,6 +260,7 @@ static void register_all(void)
>      REGISTER_FILTER(OCV,            ocv,            vf);
>      REGISTER_FILTER(OSCILLOSCOPE,   oscilloscope,   vf);
>      REGISTER_FILTER(OVERLAY,        overlay,        vf);
> +    REGISTER_FILTER(OVERLAY_OPENCL, overlay_opencl, vf);
>      REGISTER_FILTER(OWDENOISE,      owdenoise,      vf);
>      REGISTER_FILTER(PAD,            pad,            vf);
>      REGISTER_FILTER(PALETTEGEN,     palettegen,     vf);
> diff --git a/libavfilter/opencl/overlay.cl b/libavfilter/opencl/overlay.cl
> new file mode 100644
> index 0000000000..6cac5e28c4
> --- /dev/null
> +++ b/libavfilter/opencl/overlay.cl
> @@ -0,0 +1,104 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +__kernel void overlay_no_alpha(__write_only image2d_t dst,
> +                               __read_only  image2d_t main,
> +                               __read_only  image2d_t overlay,
> +                               int x_position,
> +                               int y_position)
> +{
> +    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
> +                               CLK_FILTER_NEAREST);
> +
> +    int2 overlay_size = get_image_dim(overlay);
> +    int2 loc = (int2)(get_global_id(0), get_global_id(1));
> +
> +    if (loc.x <  x_position ||
> +        loc.y <  y_position ||
> +        loc.x >= overlay_size.x + x_position ||
> +        loc.y >= overlay_size.y + y_position) {
> +        float4 val = read_imagef(main, sampler, loc);
> +        write_imagef(dst, loc, val);
> +    } else {
> +        int2 loc_overlay = (int2)(x_position, y_position);
> +        float4 val       = read_imagef(overlay, sampler, loc - loc_overlay);
> +        write_imagef(dst, loc, val);
> +    }
> +}
> +
> +__kernel void overlay_internal_alpha(__write_only image2d_t dst,
> +                                     __read_only  image2d_t main,
> +                                     __read_only  image2d_t overlay,
> +                                     int x_position,
> +                                     int y_position)
> +{
> +    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
> +                               CLK_FILTER_NEAREST);
> +
> +    int2 overlay_size = get_image_dim(overlay);
> +    int2 loc = (int2)(get_global_id(0), get_global_id(1));
> +
> +    if (loc.x <  x_position ||
> +        loc.y <  y_position ||
> +        loc.x >= overlay_size.x + x_position ||
> +        loc.y >= overlay_size.y + y_position) {
> +        float4 val = read_imagef(main, sampler, loc);
> +        write_imagef(dst, loc, val);
> +    } else {
> +        int2 loc_overlay  = (int2)(x_position, y_position);
> +        float4 in_main    = read_imagef(main,    sampler, loc);
> +        float4 in_overlay = read_imagef(overlay, sampler, loc - loc_overlay);
> +        float4 val        = in_overlay * in_overlay.w + in_main * (1.0f - in_overlay.w);
> +        write_imagef(dst, loc, val);
> +    }
> +}
> +
> +__kernel void overlay_external_alpha(__write_only image2d_t dst,
> +                                     __read_only  image2d_t main,
> +                                     __read_only  image2d_t overlay,
> +                                     __read_only  image2d_t alpha,
> +                                     int x_position,
> +                                     int y_position,
> +                                     int alpha_subsample_x,
> +                                     int alpha_subsample_y)
> +{
> +    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
> +                               CLK_FILTER_NEAREST);
> +
> +    int2 overlay_size = get_image_dim(overlay);
> +    int2 loc = (int2)(get_global_id(0), get_global_id(1));
> +
> +    if (loc.x <  x_position ||
> +        loc.y <  y_position ||
> +        loc.x >= overlay_size.x + x_position ||
> +        loc.y >= overlay_size.y + y_position) {
> +        float4 val = read_imagef(main, sampler, loc);
> +        write_imagef(dst, loc, val);
> +    } else {
> +        int2 loc_overlay  = (int2)(x_position, y_position);
> +        float4 in_main    = read_imagef(main,    sampler, loc);
> +        float4 in_overlay = read_imagef(overlay, sampler, loc - loc_overlay);
> +
> +        int2 loc_alpha    = (int2)(loc.x * alpha_subsample_x,
> +                                   loc.y * alpha_subsample_y) - loc_overlay;
> +        float4 in_alpha   = read_imagef(alpha,   sampler, loc_alpha);
> +
> +        float4 val = in_overlay * in_alpha.x + in_main * (1.0f - in_alpha.x);
> +        write_imagef(dst, loc, val);
> +    }
> +}
> diff --git a/libavfilter/opencl_source.h b/libavfilter/opencl_source.h
> index 8674a03a94..e7af58bcfa 100644
> --- a/libavfilter/opencl_source.h
> +++ b/libavfilter/opencl_source.h
> @@ -19,4 +19,6 @@
>  #ifndef AVFILTER_OPENCL_SOURCE_H
>  #define AVFILTER_OPENCL_SOURCE_H
>  
> +extern const char *ff_opencl_source_overlay;
> +
>  #endif /* AVFILTER_OPENCL_SOURCE_H */
> diff --git a/libavfilter/vf_overlay_opencl.c b/libavfilter/vf_overlay_opencl.c
> new file mode 100644
> index 0000000000..4da244dd43
> --- /dev/null
> +++ b/libavfilter/vf_overlay_opencl.c
> @@ -0,0 +1,347 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/avassert.h"
> +#include "libavutil/buffer.h"
> +#include "libavutil/common.h"
> +#include "libavutil/hwcontext.h"
> +#include "libavutil/hwcontext_opencl.h"
> +#include "libavutil/log.h"
> +#include "libavutil/mathematics.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/pixdesc.h"
> +#include "libavutil/opt.h"
> +
> +#include "avfilter.h"
> +#include "framesync2.h"
> +#include "internal.h"
> +#include "opencl.h"
> +#include "opencl_source.h"
> +#include "video.h"
> +
> +typedef struct OverlayOpenCLContext {
> +    OpenCLFilterContext ocf;
> +
> +    int              initialised;
> +    cl_kernel        kernel;
> +    cl_command_queue command_queue;
> +
> +    FFFrameSync      fs;
> +
> +    int              nb_planes;
> +    int              alpha_separate;
> +    int              alpha_subsample_x;
> +    int              alpha_subsample_y;
> +
> +    int              x_position;
> +    int              y_position;
> +} OverlayOpenCLContext;
> +
> +static int overlay_opencl_load(AVFilterContext *avctx,
> +                               enum AVPixelFormat main_format,
> +                               enum AVPixelFormat overlay_format)
> +{
> +    OverlayOpenCLContext *ctx = avctx->priv;
> +    cl_int cle;
> +    const char *source = ff_opencl_source_overlay;
> +    const char *kernel;
> +    const AVPixFmtDescriptor *main_desc, *overlay_desc;
> +    int err, i, main_planes, overlay_planes;
> +
> +    main_desc    = av_pix_fmt_desc_get(main_format);
> +    overlay_desc = av_pix_fmt_desc_get(overlay_format);
> +
> +    main_planes = overlay_planes = 0;
> +    for (i = 0; i < main_desc->nb_components; i++)
> +        main_planes = FFMAX(main_planes, main_desc->comp[i].plane + 1);
> +    for (i = 0; i < overlay_desc->nb_components; i++)
> +        overlay_planes = FFMAX(overlay_planes, overlay_desc->comp[i].plane + 1);
> +
> +    ctx->nb_planes = main_planes;
> +    if (main_planes == overlay_planes) {
> +        if (main_desc->nb_components == overlay_desc->nb_components)
> +            kernel = "overlay_no_alpha";
> +        else
> +            kernel = "overlay_internal_alpha";
> +        ctx->alpha_separate = 0;
> +    } else {
> +        kernel = "overlay_external_alpha";
> +        ctx->alpha_separate = 1;
> +        ctx->alpha_subsample_x = 1 << main_desc->log2_chroma_w;
> +        ctx->alpha_subsample_y = 1 << main_desc->log2_chroma_h;
> +    }
> +    av_log(avctx, AV_LOG_DEBUG, "Using kernel %s.\n", kernel);
> +
> +    err = ff_opencl_filter_load_program(avctx, &source, 1);
> +    if (err < 0)
> +        goto fail;
> +
> +    ctx->command_queue = clCreateCommandQueue(ctx->ocf.hwctx->context,
> +                                              ctx->ocf.hwctx->device_id,
> +                                              0, &cle);
> +    if (!ctx->command_queue) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to create OpenCL "
> +               "command queue: %d.\n", cle);
> +        err = AVERROR(EIO);
> +        goto fail;
> +    }
> +
> +    ctx->kernel = clCreateKernel(ctx->ocf.program, kernel, &cle);
> +    if (!ctx->kernel) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to create kernel: %d.\n", cle);
> +        err = AVERROR(EIO);
> +        goto fail;
> +    }
> +
> +    ctx->initialised = 1;
> +    return 0;
> +
> +fail:
> +    if (ctx->command_queue)
> +        clReleaseCommandQueue(ctx->command_queue);
> +    if (ctx->kernel)
> +        clReleaseKernel(ctx->kernel);
> +    return err;
> +}
> +
> +static int overlay_opencl_blend(FFFrameSync *fs)
> +{
> +    AVFilterContext    *avctx = fs->parent;
> +    AVFilterLink     *outlink = avctx->outputs[0];
> +    OverlayOpenCLContext *ctx = avctx->priv;
> +    AVFrame *input_main, *input_overlay;
> +    AVFrame *output;
> +    cl_mem mem;
> +    cl_int cle, x, y;
> +    size_t global_work[2];
> +    int kernel_arg = 0;
> +    int err, plane;
> +
> +    err = ff_framesync2_get_frame(fs, 0, &input_main, 0);
> +    if (err < 0)
> +        return err;
> +    err = ff_framesync2_get_frame(fs, 1, &input_overlay, 0);
> +    if (err < 0)
> +        return err;
> +
> +    if (!ctx->initialised) {
> +        AVHWFramesContext *main_fc =
> +            (AVHWFramesContext*)input_main->hw_frames_ctx->data;
> +        AVHWFramesContext *overlay_fc =
> +            (AVHWFramesContext*)input_overlay->hw_frames_ctx->data;
> +
> +        err = overlay_opencl_load(avctx, main_fc->sw_format,
> +                                  overlay_fc->sw_format);
> +        if (err < 0)
> +            return err;
> +    }
> +
> +    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
> +    if (!output) {
> +        err = AVERROR(ENOMEM);
> +        goto fail;
> +    }
> +
> +    for (plane = 0; plane < ctx->nb_planes; plane++) {
> +        kernel_arg = 0;
> +
> +        mem = (cl_mem)output->data[plane];
> +        cle = clSetKernelArg(ctx->kernel, kernel_arg++, sizeof(cl_mem), &mem);
> +        if (cle != CL_SUCCESS)
> +            goto fail_kernel_arg;
> +
> +        mem = (cl_mem)input_main->data[plane];
> +        cle = clSetKernelArg(ctx->kernel, kernel_arg++, sizeof(cl_mem), &mem);
> +        if (cle != CL_SUCCESS)
> +            goto fail_kernel_arg;
> +
> +        mem = (cl_mem)input_overlay->data[plane];
> +        cle = clSetKernelArg(ctx->kernel, kernel_arg++, sizeof(cl_mem), &mem);
> +        if (cle != CL_SUCCESS)
> +            goto fail_kernel_arg;
> +
> +        if (ctx->alpha_separate) {
> +            mem = (cl_mem)input_overlay->data[ctx->nb_planes];
> +            cle = clSetKernelArg(ctx->kernel, kernel_arg++, sizeof(cl_mem), &mem);
> +            if (cle != CL_SUCCESS)
> +                goto fail_kernel_arg;
> +        }
> +
> +        x = ctx->x_position;
> +        y = ctx->y_position;
> +        cle = clSetKernelArg(ctx->kernel, kernel_arg++, sizeof(cl_int), &x);
> +        if (cle != CL_SUCCESS)
> +            goto fail_kernel_arg;
> +        cle = clSetKernelArg(ctx->kernel, kernel_arg++, sizeof(cl_int), &y);
> +        if (cle != CL_SUCCESS)
> +            goto fail_kernel_arg;
> +
> +        if (ctx->alpha_separate) {
> +            cl_int alpha_subsample_x = plane > 0 ? ctx->alpha_subsample_x : 1;
> +            cl_int alpha_subsample_y = plane > 0 ? ctx->alpha_subsample_y : 1;
> +
> +            cle = clSetKernelArg(ctx->kernel, kernel_arg++, sizeof(cl_int), &alpha_subsample_x);
> +            if (cle != CL_SUCCESS)
> +                goto fail_kernel_arg;
> +            cle = clSetKernelArg(ctx->kernel, kernel_arg++, sizeof(cl_int), &alpha_subsample_y);
> +            if (cle != CL_SUCCESS)
> +                goto fail_kernel_arg;
> +        }
> +
> +        global_work[0] = output->width;
> +        global_work[1] = output->height;
> +
> +        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->kernel, 2, NULL,
> +                                     global_work, NULL, 0, NULL, NULL);
> +        if (cle != CL_SUCCESS) {
> +            av_log(avctx, AV_LOG_ERROR, "Failed to enqueue "
> +                   "overlay kernel for plane %d: %d.\n", cle, plane);
> +            err = AVERROR(EIO);
> +            goto fail;
> +        }
> +    }
> +
> +    cle = clFinish(ctx->command_queue);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to finish "
> +               "command queue: %d.\n", cle);
> +        err = AVERROR(EIO);
> +        goto fail;
> +    }
> +
> +    err = av_frame_copy_props(output, input_main);
> +
> +    av_log(avctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n",
> +           av_get_pix_fmt_name(output->format),
> +           output->width, output->height, output->pts);
> +
> +    return ff_filter_frame(outlink, output);
> +
> +fail_kernel_arg:
> +    av_log(avctx, AV_LOG_ERROR, "Failed to set kernel arg %d: %d.\n",
> +           kernel_arg, cle);
> +    err = AVERROR(EIO);
> +fail:
> +    return err;
> +}
> +
> +static int overlay_opencl_config_output(AVFilterLink *outlink)
> +{
> +    AVFilterContext *avctx = outlink->src;
> +    OverlayOpenCLContext *ctx = avctx->priv;
> +    int err;
> +
> +    err = ff_opencl_filter_config_output(outlink);
> +    if (err < 0)
> +        return err;
> +
> +    err = ff_framesync2_init_dualinput(&ctx->fs, avctx);
> +    if (err < 0)
> +        return err;
> +
> +    return ff_framesync2_configure(&ctx->fs);
> +}
> +
> +static av_cold int overlay_opencl_init(AVFilterContext *avctx)
> +{
> +    OverlayOpenCLContext *ctx = avctx->priv;
> +
> +    ctx->fs.on_event = &overlay_opencl_blend;
> +
> +    return ff_opencl_filter_init(avctx);
> +}
> +
> +static int overlay_opencl_activate(AVFilterContext *avctx)
> +{
> +    OverlayOpenCLContext *ctx = avctx->priv;
> +
> +    return ff_framesync2_activate(&ctx->fs);
> +}
> +
> +static av_cold void overlay_opencl_uninit(AVFilterContext *avctx)
> +{
> +    OverlayOpenCLContext *ctx = avctx->priv;
> +    cl_int cle;
> +
> +    if (ctx->kernel) {
> +        cle = clReleaseKernel(ctx->kernel);
> +        if (cle != CL_SUCCESS)
> +            av_log(avctx, AV_LOG_ERROR, "Failed to release "
> +                   "kernel: %d.\n", cle);
> +    }
> +
> +    if (ctx->command_queue) {
> +        cle = clReleaseCommandQueue(ctx->command_queue);
> +        if (cle != CL_SUCCESS)
> +            av_log(avctx, AV_LOG_ERROR, "Failed to release "
> +                   "command queue: %d.\n", cle);
> +    }
> +
> +    ff_opencl_filter_uninit(avctx);
> +
> +    ff_framesync2_uninit(&ctx->fs);
> +}
> +
> +#define OFFSET(x) offsetof(OverlayOpenCLContext, x)
> +#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM)
> +static const AVOption overlay_opencl_options[] = {
> +    { "x", "Overlay x position",
> +      OFFSET(x_position), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, .flags = FLAGS },
> +    { "y", "Overlay y position",
> +      OFFSET(y_position), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, .flags = FLAGS },
> +    { NULL },
> +};
> +
> +AVFILTER_DEFINE_CLASS(overlay_opencl);
> +
> +static const AVFilterPad overlay_opencl_inputs[] = {
> +    {
> +        .name         = "main",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +        .config_props = &ff_opencl_filter_config_input,
> +    },
> +    {
> +        .name         = "overlay",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +        .config_props = &ff_opencl_filter_config_input,
> +    },
> +    { NULL }
> +};
> +
> +static const AVFilterPad overlay_opencl_outputs[] = {
> +    {
> +        .name          = "default",
> +        .type          = AVMEDIA_TYPE_VIDEO,
> +        .config_props  = &overlay_opencl_config_output,
> +    },
> +    { NULL }
> +};
> +
> +AVFilter ff_vf_overlay_opencl = {
> +    .name            = "overlay_opencl",
> +    .description     = NULL_IF_CONFIG_SMALL("Overlay one video on top of another"),
> +    .priv_size       = sizeof(OverlayOpenCLContext),
> +    .priv_class      = &overlay_opencl_class,
> +    .init            = &overlay_opencl_init,
> +    .uninit          = &overlay_opencl_uninit,
> +    .query_formats   = &ff_opencl_filter_query_formats,
> +    .activate        = &overlay_opencl_activate,
> +    .inputs          = overlay_opencl_inputs,
> +    .outputs         = overlay_opencl_outputs,
> +    .flags_internal  = FF_FILTER_FLAG_HWFRAME_AWARE,
> +};

Regards,
Mark Thompson Sept. 10, 2017, 9:14 p.m.
On 10/09/17 22:10, Nicolas George wrote:
> Le quartidi 24 fructidor, an CCXXV, Mark Thompson a écrit :
>> Input and output formats must be the same, the overlay format must be
>> the same as the input except possibly with an additional alpha component.
>> ---
>>  configure                       |   1 +
>>  libavfilter/Makefile            |   2 +
>>  libavfilter/allfilters.c        |   1 +
>>  libavfilter/opencl/overlay.cl   | 104 ++++++++++++
>>  libavfilter/opencl_source.h     |   2 +
>>  libavfilter/vf_overlay_opencl.c | 347 ++++++++++++++++++++++++++++++++++++++++
>>  6 files changed, 457 insertions(+)
>>  create mode 100644 libavfilter/opencl/overlay.cl
>>  create mode 100644 libavfilter/vf_overlay_opencl.c
>>
>> diff --git a/configure b/configure
>> index 895ae2ec38..c036a53a69 100755
>> --- a/configure
>> +++ b/configure
>> @@ -3178,6 +3178,7 @@ negate_filter_deps="lut_filter"
>>  nnedi_filter_deps="gpl"
>>  ocr_filter_deps="libtesseract"
>>  ocv_filter_deps="libopencv"
>> +overlay_opencl_filter_deps="opencl"
>>  owdenoise_filter_deps="gpl"
>>  pan_filter_deps="swresample"
>>  perspective_filter_deps="gpl"
>> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
>> index cb3a1424d9..cc9d4021b8 100644
>> --- a/libavfilter/Makefile
>> +++ b/libavfilter/Makefile
>> @@ -249,6 +249,8 @@ OBJS-$(CONFIG_OCV_FILTER)                    += vf_libopencv.o
>>  OBJS-$(CONFIG_OPENCL)                        += deshake_opencl.o unsharp_opencl.o
>>  OBJS-$(CONFIG_OSCILLOSCOPE_FILTER)           += vf_datascope.o
> 
>>  OBJS-$(CONFIG_OVERLAY_FILTER)                += vf_overlay.o framesync2.o
>> +OBJS-$(CONFIG_OVERLAY_OPENCL_FILTER)         += vf_overlay_opencl.o opencl.o \
>> +                                                opencl/overlay.o
> 
> Missing framesync2.o?

Yes; fixed locally.

Thank you!

- Mark
James Almer Sept. 10, 2017, 9:46 p.m.
On 9/10/2017 6:10 PM, Nicolas George wrote:
> Le quartidi 24 fructidor, an CCXXV, Mark Thompson a écrit :
>> Input and output formats must be the same, the overlay format must be
>> the same as the input except possibly with an additional alpha component.
>> ---
>>  configure                       |   1 +
>>  libavfilter/Makefile            |   2 +
>>  libavfilter/allfilters.c        |   1 +
>>  libavfilter/opencl/overlay.cl   | 104 ++++++++++++
>>  libavfilter/opencl_source.h     |   2 +
>>  libavfilter/vf_overlay_opencl.c | 347 ++++++++++++++++++++++++++++++++++++++++
>>  6 files changed, 457 insertions(+)
>>  create mode 100644 libavfilter/opencl/overlay.cl
>>  create mode 100644 libavfilter/vf_overlay_opencl.c
>>
>> diff --git a/configure b/configure
>> index 895ae2ec38..c036a53a69 100755
>> --- a/configure
>> +++ b/configure
>> @@ -3178,6 +3178,7 @@ negate_filter_deps="lut_filter"
>>  nnedi_filter_deps="gpl"
>>  ocr_filter_deps="libtesseract"
>>  ocv_filter_deps="libopencv"
>> +overlay_opencl_filter_deps="opencl"
>>  owdenoise_filter_deps="gpl"
>>  pan_filter_deps="swresample"
>>  perspective_filter_deps="gpl"
>> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
>> index cb3a1424d9..cc9d4021b8 100644
>> --- a/libavfilter/Makefile
>> +++ b/libavfilter/Makefile
>> @@ -249,6 +249,8 @@ OBJS-$(CONFIG_OCV_FILTER)                    += vf_libopencv.o
>>  OBJS-$(CONFIG_OPENCL)                        += deshake_opencl.o unsharp_opencl.o
>>  OBJS-$(CONFIG_OSCILLOSCOPE_FILTER)           += vf_datascope.o
> 
>>  OBJS-$(CONFIG_OVERLAY_FILTER)                += vf_overlay.o framesync2.o
>> +OBJS-$(CONFIG_OVERLAY_OPENCL_FILTER)         += vf_overlay_opencl.o opencl.o \
>> +                                                opencl/overlay.o
> 
> Missing framesync2.o?

Maybe it should be its own subsystem entry with the filters stating the
dependency in configure instead? It's used by enough filters by now to
justify that.
Unless of course it becomes a requirement for all filters in the long
run, where it should be an unconditional OBJS object.
Wei Gao Sept. 11, 2017, 2:04 a.m.
2017-09-11 5:46 GMT+08:00 James Almer <jamrial@gmail.com>:

> On 9/10/2017 6:10 PM, Nicolas George wrote:
> > Le quartidi 24 fructidor, an CCXXV, Mark Thompson a écrit :
> >> Input and output formats must be the same, the overlay format must be
> >> the same as the input except possibly with an additional alpha
> component.
> >> ---
> >>  configure                       |   1 +
> >>  libavfilter/Makefile            |   2 +
> >>  libavfilter/allfilters.c        |   1 +
> >>  libavfilter/opencl/overlay.cl   | 104 ++++++++++++
> >>  libavfilter/opencl_source.h     |   2 +
> >>  libavfilter/vf_overlay_opencl.c | 347 ++++++++++++++++++++++++++++++
> ++++++++++
> >>  6 files changed, 457 insertions(+)
> >>  create mode 100644 libavfilter/opencl/overlay.cl
> >>  create mode 100644 libavfilter/vf_overlay_opencl.c
> >>
> >> diff --git a/configure b/configure
> >> index 895ae2ec38..c036a53a69 100755
> >> --- a/configure
> >> +++ b/configure
> >> @@ -3178,6 +3178,7 @@ negate_filter_deps="lut_filter"
> >>  nnedi_filter_deps="gpl"
> >>  ocr_filter_deps="libtesseract"
> >>  ocv_filter_deps="libopencv"
> >> +overlay_opencl_filter_deps="opencl"
> >>  owdenoise_filter_deps="gpl"
> >>  pan_filter_deps="swresample"
> >>  perspective_filter_deps="gpl"
> >> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> >> index cb3a1424d9..cc9d4021b8 100644
> >> --- a/libavfilter/Makefile
> >> +++ b/libavfilter/Makefile
> >> @@ -249,6 +249,8 @@ OBJS-$(CONFIG_OCV_FILTER)                    +=
> vf_libopencv.o
> >>  OBJS-$(CONFIG_OPENCL)                        += deshake_opencl.o
> unsharp_opencl.o
> >>  OBJS-$(CONFIG_OSCILLOSCOPE_FILTER)           += vf_datascope.o
> >
> >>  OBJS-$(CONFIG_OVERLAY_FILTER)                += vf_overlay.o
> framesync2.o
> >> +OBJS-$(CONFIG_OVERLAY_OPENCL_FILTER)         += vf_overlay_opencl.o
> opencl.o \
> >> +                                                opencl/overlay.o
> >
> > Missing framesync2.o?
>
> Maybe it should be its own subsystem entry with the filters stating the
> dependency in configure instead? It's used by enough filters by now to
> justify that.
> Unless of course it becomes a requirement for all filters in the long
> run, where it should be an unconditional OBJS object.
>
looks good to me

> _____________________________________________
Mark Thompson Sept. 11, 2017, 9:34 a.m.
On 10/09/17 21:53, Mark Thompson wrote:
> Input and output formats must be the same, the overlay format must be
> the same as the input except possibly with an additional alpha component.
> ---
>  configure                       |   1 +
>  libavfilter/Makefile            |   2 +
>  libavfilter/allfilters.c        |   1 +
>  libavfilter/opencl/overlay.cl   | 104 ++++++++++++
>  libavfilter/opencl_source.h     |   2 +
>  libavfilter/vf_overlay_opencl.c | 347 ++++++++++++++++++++++++++++++++++++++++
>  6 files changed, 457 insertions(+)
>  create mode 100644 libavfilter/opencl/overlay.cl
>  create mode 100644 libavfilter/vf_overlay_opencl.c

The positioning is broken in this version for subsampled planes placed at (x, y) != 0.  Will fix.

- Mark
Nicolas George Sept. 11, 2017, 4:59 p.m.
Le quartidi 24 fructidor, an CCXXV, James Almer a écrit :
> Maybe it should be its own subsystem entry with the filters stating the
> dependency in configure instead? It's used by enough filters by now to
> justify that.

I do not oppose, but what would be the benefit?

Regards,
James Almer Sept. 11, 2017, 5:23 p.m.
On 9/11/2017 1:59 PM, Nicolas George wrote:
> Le quartidi 24 fructidor, an CCXXV, James Almer a écrit :
>> Maybe it should be its own subsystem entry with the filters stating the
>> dependency in configure instead? It's used by enough filters by now to
>> justify that.
> 
> I do not oppose, but what would be the benefit?

Cleaner Makefile, removing a lot of repeated objects on every filter
dependency entry.
Same reason why so many "Subsystem" entries exist in libavcodec.

Is the idea that every filter will eventually use framesync2 in the long
run, or will it not be used by some? If the latter I'll cook up a patch
to make framesync2 a lavfi configure time dependency. Otherwise lets
just add it to the unconditional OBJS list.
Nicolas George Sept. 11, 2017, 5:29 p.m.
Le quintidi 25 fructidor, an CCXXV, James Almer a écrit :
> Cleaner Makefile, removing a lot of repeated objects on every filter
> dependency entry.

If I understand correctly, it only moves the complexity from Makefile to
configure. But configure and shell scripts are for what cannot be
achieved simply with make; this can. For example, if a subsystem is made
of several files, having configure enable all of them is simpler than
having all the files as a dependency. But in this case there is only
one.

> Is the idea that every filter will eventually use framesync2 in the long
> run, or will it not be used by some?

framesync(2) is for filters that have several video inputs that needs to
get synchronous frames on them. So clearly not all filters.

Regards,
James Almer Sept. 11, 2017, 6:18 p.m.
On 9/11/2017 2:29 PM, Nicolas George wrote:
> Le quintidi 25 fructidor, an CCXXV, James Almer a écrit :
>> Cleaner Makefile, removing a lot of repeated objects on every filter
>> dependency entry.
> 
> If I understand correctly, it only moves the complexity from Makefile to
> configure. But configure and shell scripts are for what cannot be
> achieved simply with make; this can. For example, if a subsystem is made
> of several files, having configure enable all of them is simpler than
> having all the files as a dependency. But in this case there is only
> one.

Yes, it moves things to configure where many filters already have
assorted non lavfi dependencies listed there (fft, etc), so the end
result of having the Makefile listing only the filter specific file(s)
for each OBJS entry is IMO worth the change.

> 
>> Is the idea that every filter will eventually use framesync2 in the long
>> run, or will it not be used by some?
> 
> framesync(2) is for filters that have several video inputs that needs to
> get synchronous frames on them. So clearly not all filters.

Ok. Since you have a patchset removing framesync and renaming framesync2
I'll wait until that's committed before sending the aforementioned change.
Nicolas George Sept. 12, 2017, 9:11 a.m.
Le quintidi 25 fructidor, an CCXXV, James Almer a écrit :
> Yes, it moves things to configure where many filters already have
> assorted non lavfi dependencies listed there (fft, etc), so the end
> result of having the Makefile listing only the filter specific file(s)
> for each OBJS entry is IMO worth the change.

It still like moving things around for the sake of moving things around
without an actual benefit. For reference, what started this discussion
was a patch forgetting the dependency: it would have been forgotten
anyway. Still, it does not make things worse either, so I do not argue.

> Ok. Since you have a patchset removing framesync and renaming framesync2
> I'll wait until that's committed before sending the aforementioned change.

I just pushed it.

Regards,

Patch hide | download patch | download mbox

diff --git a/configure b/configure
index 895ae2ec38..c036a53a69 100755
--- a/configure
+++ b/configure
@@ -3178,6 +3178,7 @@  negate_filter_deps="lut_filter"
 nnedi_filter_deps="gpl"
 ocr_filter_deps="libtesseract"
 ocv_filter_deps="libopencv"
+overlay_opencl_filter_deps="opencl"
 owdenoise_filter_deps="gpl"
 pan_filter_deps="swresample"
 perspective_filter_deps="gpl"
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index cb3a1424d9..cc9d4021b8 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -249,6 +249,8 @@  OBJS-$(CONFIG_OCV_FILTER)                    += vf_libopencv.o
 OBJS-$(CONFIG_OPENCL)                        += deshake_opencl.o unsharp_opencl.o
 OBJS-$(CONFIG_OSCILLOSCOPE_FILTER)           += vf_datascope.o
 OBJS-$(CONFIG_OVERLAY_FILTER)                += vf_overlay.o framesync2.o
+OBJS-$(CONFIG_OVERLAY_OPENCL_FILTER)         += vf_overlay_opencl.o opencl.o \
+                                                opencl/overlay.o
 OBJS-$(CONFIG_OWDENOISE_FILTER)              += vf_owdenoise.o
 OBJS-$(CONFIG_PAD_FILTER)                    += vf_pad.o
 OBJS-$(CONFIG_PALETTEGEN_FILTER)             += vf_palettegen.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 154ddf706d..261dd3a8e1 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -260,6 +260,7 @@  static void register_all(void)
     REGISTER_FILTER(OCV,            ocv,            vf);
     REGISTER_FILTER(OSCILLOSCOPE,   oscilloscope,   vf);
     REGISTER_FILTER(OVERLAY,        overlay,        vf);
+    REGISTER_FILTER(OVERLAY_OPENCL, overlay_opencl, vf);
     REGISTER_FILTER(OWDENOISE,      owdenoise,      vf);
     REGISTER_FILTER(PAD,            pad,            vf);
     REGISTER_FILTER(PALETTEGEN,     palettegen,     vf);
diff --git a/libavfilter/opencl/overlay.cl b/libavfilter/opencl/overlay.cl
new file mode 100644
index 0000000000..6cac5e28c4
--- /dev/null
+++ b/libavfilter/opencl/overlay.cl
@@ -0,0 +1,104 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+__kernel void overlay_no_alpha(__write_only image2d_t dst,
+                               __read_only  image2d_t main,
+                               __read_only  image2d_t overlay,
+                               int x_position,
+                               int y_position)
+{
+    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
+                               CLK_FILTER_NEAREST);
+
+    int2 overlay_size = get_image_dim(overlay);
+    int2 loc = (int2)(get_global_id(0), get_global_id(1));
+
+    if (loc.x <  x_position ||
+        loc.y <  y_position ||
+        loc.x >= overlay_size.x + x_position ||
+        loc.y >= overlay_size.y + y_position) {
+        float4 val = read_imagef(main, sampler, loc);
+        write_imagef(dst, loc, val);
+    } else {
+        int2 loc_overlay = (int2)(x_position, y_position);
+        float4 val       = read_imagef(overlay, sampler, loc - loc_overlay);
+        write_imagef(dst, loc, val);
+    }
+}
+
+__kernel void overlay_internal_alpha(__write_only image2d_t dst,
+                                     __read_only  image2d_t main,
+                                     __read_only  image2d_t overlay,
+                                     int x_position,
+                                     int y_position)
+{
+    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
+                               CLK_FILTER_NEAREST);
+
+    int2 overlay_size = get_image_dim(overlay);
+    int2 loc = (int2)(get_global_id(0), get_global_id(1));
+
+    if (loc.x <  x_position ||
+        loc.y <  y_position ||
+        loc.x >= overlay_size.x + x_position ||
+        loc.y >= overlay_size.y + y_position) {
+        float4 val = read_imagef(main, sampler, loc);
+        write_imagef(dst, loc, val);
+    } else {
+        int2 loc_overlay  = (int2)(x_position, y_position);
+        float4 in_main    = read_imagef(main,    sampler, loc);
+        float4 in_overlay = read_imagef(overlay, sampler, loc - loc_overlay);
+        float4 val        = in_overlay * in_overlay.w + in_main * (1.0f - in_overlay.w);
+        write_imagef(dst, loc, val);
+    }
+}
+
+__kernel void overlay_external_alpha(__write_only image2d_t dst,
+                                     __read_only  image2d_t main,
+                                     __read_only  image2d_t overlay,
+                                     __read_only  image2d_t alpha,
+                                     int x_position,
+                                     int y_position,
+                                     int alpha_subsample_x,
+                                     int alpha_subsample_y)
+{
+    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
+                               CLK_FILTER_NEAREST);
+
+    int2 overlay_size = get_image_dim(overlay);
+    int2 loc = (int2)(get_global_id(0), get_global_id(1));
+
+    if (loc.x <  x_position ||
+        loc.y <  y_position ||
+        loc.x >= overlay_size.x + x_position ||
+        loc.y >= overlay_size.y + y_position) {
+        float4 val = read_imagef(main, sampler, loc);
+        write_imagef(dst, loc, val);
+    } else {
+        int2 loc_overlay  = (int2)(x_position, y_position);
+        float4 in_main    = read_imagef(main,    sampler, loc);
+        float4 in_overlay = read_imagef(overlay, sampler, loc - loc_overlay);
+
+        int2 loc_alpha    = (int2)(loc.x * alpha_subsample_x,
+                                   loc.y * alpha_subsample_y) - loc_overlay;
+        float4 in_alpha   = read_imagef(alpha,   sampler, loc_alpha);
+
+        float4 val = in_overlay * in_alpha.x + in_main * (1.0f - in_alpha.x);
+        write_imagef(dst, loc, val);
+    }
+}
diff --git a/libavfilter/opencl_source.h b/libavfilter/opencl_source.h
index 8674a03a94..e7af58bcfa 100644
--- a/libavfilter/opencl_source.h
+++ b/libavfilter/opencl_source.h
@@ -19,4 +19,6 @@ 
 #ifndef AVFILTER_OPENCL_SOURCE_H
 #define AVFILTER_OPENCL_SOURCE_H
 
+extern const char *ff_opencl_source_overlay;
+
 #endif /* AVFILTER_OPENCL_SOURCE_H */
diff --git a/libavfilter/vf_overlay_opencl.c b/libavfilter/vf_overlay_opencl.c
new file mode 100644
index 0000000000..4da244dd43
--- /dev/null
+++ b/libavfilter/vf_overlay_opencl.c
@@ -0,0 +1,347 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/buffer.h"
+#include "libavutil/common.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_opencl.h"
+#include "libavutil/log.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/mem.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/opt.h"
+
+#include "avfilter.h"
+#include "framesync2.h"
+#include "internal.h"
+#include "opencl.h"
+#include "opencl_source.h"
+#include "video.h"
+
+typedef struct OverlayOpenCLContext {
+    OpenCLFilterContext ocf;
+
+    int              initialised;
+    cl_kernel        kernel;
+    cl_command_queue command_queue;
+
+    FFFrameSync      fs;
+
+    int              nb_planes;
+    int              alpha_separate;
+    int              alpha_subsample_x;
+    int              alpha_subsample_y;
+
+    int              x_position;
+    int              y_position;
+} OverlayOpenCLContext;
+
+static int overlay_opencl_load(AVFilterContext *avctx,
+                               enum AVPixelFormat main_format,
+                               enum AVPixelFormat overlay_format)
+{
+    OverlayOpenCLContext *ctx = avctx->priv;
+    cl_int cle;
+    const char *source = ff_opencl_source_overlay;
+    const char *kernel;
+    const AVPixFmtDescriptor *main_desc, *overlay_desc;
+    int err, i, main_planes, overlay_planes;
+
+    main_desc    = av_pix_fmt_desc_get(main_format);
+    overlay_desc = av_pix_fmt_desc_get(overlay_format);
+
+    main_planes = overlay_planes = 0;
+    for (i = 0; i < main_desc->nb_components; i++)
+        main_planes = FFMAX(main_planes, main_desc->comp[i].plane + 1);
+    for (i = 0; i < overlay_desc->nb_components; i++)
+        overlay_planes = FFMAX(overlay_planes, overlay_desc->comp[i].plane + 1);
+
+    ctx->nb_planes = main_planes;
+    if (main_planes == overlay_planes) {
+        if (main_desc->nb_components == overlay_desc->nb_components)
+            kernel = "overlay_no_alpha";
+        else
+            kernel = "overlay_internal_alpha";
+        ctx->alpha_separate = 0;
+    } else {
+        kernel = "overlay_external_alpha";
+        ctx->alpha_separate = 1;
+        ctx->alpha_subsample_x = 1 << main_desc->log2_chroma_w;
+        ctx->alpha_subsample_y = 1 << main_desc->log2_chroma_h;
+    }
+    av_log(avctx, AV_LOG_DEBUG, "Using kernel %s.\n", kernel);
+
+    err = ff_opencl_filter_load_program(avctx, &source, 1);
+    if (err < 0)
+        goto fail;
+
+    ctx->command_queue = clCreateCommandQueue(ctx->ocf.hwctx->context,
+                                              ctx->ocf.hwctx->device_id,
+                                              0, &cle);
+    if (!ctx->command_queue) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to create OpenCL "
+               "command queue: %d.\n", cle);
+        err = AVERROR(EIO);
+        goto fail;
+    }
+
+    ctx->kernel = clCreateKernel(ctx->ocf.program, kernel, &cle);
+    if (!ctx->kernel) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to create kernel: %d.\n", cle);
+        err = AVERROR(EIO);
+        goto fail;
+    }
+
+    ctx->initialised = 1;
+    return 0;
+
+fail:
+    if (ctx->command_queue)
+        clReleaseCommandQueue(ctx->command_queue);
+    if (ctx->kernel)
+        clReleaseKernel(ctx->kernel);
+    return err;
+}
+
+static int overlay_opencl_blend(FFFrameSync *fs)
+{
+    AVFilterContext    *avctx = fs->parent;
+    AVFilterLink     *outlink = avctx->outputs[0];
+    OverlayOpenCLContext *ctx = avctx->priv;
+    AVFrame *input_main, *input_overlay;
+    AVFrame *output;
+    cl_mem mem;
+    cl_int cle, x, y;
+    size_t global_work[2];
+    int kernel_arg = 0;
+    int err, plane;
+
+    err = ff_framesync2_get_frame(fs, 0, &input_main, 0);
+    if (err < 0)
+        return err;
+    err = ff_framesync2_get_frame(fs, 1, &input_overlay, 0);
+    if (err < 0)
+        return err;
+
+    if (!ctx->initialised) {
+        AVHWFramesContext *main_fc =
+            (AVHWFramesContext*)input_main->hw_frames_ctx->data;
+        AVHWFramesContext *overlay_fc =
+            (AVHWFramesContext*)input_overlay->hw_frames_ctx->data;
+
+        err = overlay_opencl_load(avctx, main_fc->sw_format,
+                                  overlay_fc->sw_format);
+        if (err < 0)
+            return err;
+    }
+
+    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+    if (!output) {
+        err = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    for (plane = 0; plane < ctx->nb_planes; plane++) {
+        kernel_arg = 0;
+
+        mem = (cl_mem)output->data[plane];
+        cle = clSetKernelArg(ctx->kernel, kernel_arg++, sizeof(cl_mem), &mem);
+        if (cle != CL_SUCCESS)
+            goto fail_kernel_arg;
+
+        mem = (cl_mem)input_main->data[plane];
+        cle = clSetKernelArg(ctx->kernel, kernel_arg++, sizeof(cl_mem), &mem);
+        if (cle != CL_SUCCESS)
+            goto fail_kernel_arg;
+
+        mem = (cl_mem)input_overlay->data[plane];
+        cle = clSetKernelArg(ctx->kernel, kernel_arg++, sizeof(cl_mem), &mem);
+        if (cle != CL_SUCCESS)
+            goto fail_kernel_arg;
+
+        if (ctx->alpha_separate) {
+            mem = (cl_mem)input_overlay->data[ctx->nb_planes];
+            cle = clSetKernelArg(ctx->kernel, kernel_arg++, sizeof(cl_mem), &mem);
+            if (cle != CL_SUCCESS)
+                goto fail_kernel_arg;
+        }
+
+        x = ctx->x_position;
+        y = ctx->y_position;
+        cle = clSetKernelArg(ctx->kernel, kernel_arg++, sizeof(cl_int), &x);
+        if (cle != CL_SUCCESS)
+            goto fail_kernel_arg;
+        cle = clSetKernelArg(ctx->kernel, kernel_arg++, sizeof(cl_int), &y);
+        if (cle != CL_SUCCESS)
+            goto fail_kernel_arg;
+
+        if (ctx->alpha_separate) {
+            cl_int alpha_subsample_x = plane > 0 ? ctx->alpha_subsample_x : 1;
+            cl_int alpha_subsample_y = plane > 0 ? ctx->alpha_subsample_y : 1;
+
+            cle = clSetKernelArg(ctx->kernel, kernel_arg++, sizeof(cl_int), &alpha_subsample_x);
+            if (cle != CL_SUCCESS)
+                goto fail_kernel_arg;
+            cle = clSetKernelArg(ctx->kernel, kernel_arg++, sizeof(cl_int), &alpha_subsample_y);
+            if (cle != CL_SUCCESS)
+                goto fail_kernel_arg;
+        }
+
+        global_work[0] = output->width;
+        global_work[1] = output->height;
+
+        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->kernel, 2, NULL,
+                                     global_work, NULL, 0, NULL, NULL);
+        if (cle != CL_SUCCESS) {
+            av_log(avctx, AV_LOG_ERROR, "Failed to enqueue "
+                   "overlay kernel for plane %d: %d.\n", cle, plane);
+            err = AVERROR(EIO);
+            goto fail;
+        }
+    }
+
+    cle = clFinish(ctx->command_queue);
+    if (cle != CL_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to finish "
+               "command queue: %d.\n", cle);
+        err = AVERROR(EIO);
+        goto fail;
+    }
+
+    err = av_frame_copy_props(output, input_main);
+
+    av_log(avctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n",
+           av_get_pix_fmt_name(output->format),
+           output->width, output->height, output->pts);
+
+    return ff_filter_frame(outlink, output);
+
+fail_kernel_arg:
+    av_log(avctx, AV_LOG_ERROR, "Failed to set kernel arg %d: %d.\n",
+           kernel_arg, cle);
+    err = AVERROR(EIO);
+fail:
+    return err;
+}
+
+static int overlay_opencl_config_output(AVFilterLink *outlink)
+{
+    AVFilterContext *avctx = outlink->src;
+    OverlayOpenCLContext *ctx = avctx->priv;
+    int err;
+
+    err = ff_opencl_filter_config_output(outlink);
+    if (err < 0)
+        return err;
+
+    err = ff_framesync2_init_dualinput(&ctx->fs, avctx);
+    if (err < 0)
+        return err;
+
+    return ff_framesync2_configure(&ctx->fs);
+}
+
+static av_cold int overlay_opencl_init(AVFilterContext *avctx)
+{
+    OverlayOpenCLContext *ctx = avctx->priv;
+
+    ctx->fs.on_event = &overlay_opencl_blend;
+
+    return ff_opencl_filter_init(avctx);
+}
+
+static int overlay_opencl_activate(AVFilterContext *avctx)
+{
+    OverlayOpenCLContext *ctx = avctx->priv;
+
+    return ff_framesync2_activate(&ctx->fs);
+}
+
+static av_cold void overlay_opencl_uninit(AVFilterContext *avctx)
+{
+    OverlayOpenCLContext *ctx = avctx->priv;
+    cl_int cle;
+
+    if (ctx->kernel) {
+        cle = clReleaseKernel(ctx->kernel);
+        if (cle != CL_SUCCESS)
+            av_log(avctx, AV_LOG_ERROR, "Failed to release "
+                   "kernel: %d.\n", cle);
+    }
+
+    if (ctx->command_queue) {
+        cle = clReleaseCommandQueue(ctx->command_queue);
+        if (cle != CL_SUCCESS)
+            av_log(avctx, AV_LOG_ERROR, "Failed to release "
+                   "command queue: %d.\n", cle);
+    }
+
+    ff_opencl_filter_uninit(avctx);
+
+    ff_framesync2_uninit(&ctx->fs);
+}
+
+#define OFFSET(x) offsetof(OverlayOpenCLContext, x)
+#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM)
+static const AVOption overlay_opencl_options[] = {
+    { "x", "Overlay x position",
+      OFFSET(x_position), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, .flags = FLAGS },
+    { "y", "Overlay y position",
+      OFFSET(y_position), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, .flags = FLAGS },
+    { NULL },
+};
+
+AVFILTER_DEFINE_CLASS(overlay_opencl);
+
+static const AVFilterPad overlay_opencl_inputs[] = {
+    {
+        .name         = "main",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = &ff_opencl_filter_config_input,
+    },
+    {
+        .name         = "overlay",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = &ff_opencl_filter_config_input,
+    },
+    { NULL }
+};
+
+static const AVFilterPad overlay_opencl_outputs[] = {
+    {
+        .name          = "default",
+        .type          = AVMEDIA_TYPE_VIDEO,
+        .config_props  = &overlay_opencl_config_output,
+    },
+    { NULL }
+};
+
+AVFilter ff_vf_overlay_opencl = {
+    .name            = "overlay_opencl",
+    .description     = NULL_IF_CONFIG_SMALL("Overlay one video on top of another"),
+    .priv_size       = sizeof(OverlayOpenCLContext),
+    .priv_class      = &overlay_opencl_class,
+    .init            = &overlay_opencl_init,
+    .uninit          = &overlay_opencl_uninit,
+    .query_formats   = &ff_opencl_filter_query_formats,
+    .activate        = &overlay_opencl_activate,
+    .inputs          = overlay_opencl_inputs,
+    .outputs         = overlay_opencl_outputs,
+    .flags_internal  = FF_FILTER_FLAG_HWFRAME_AWARE,
+};