[FFmpeg-devel] avfilter: add xfade opencl filter

Message ID	20200124104656.26372-1-onemda@gmail.com
State	Superseded
Headers	show Return-Path: <ffmpeg-devel-bounces@ffmpeg.org> From: Paul B Mahol <onemda@gmail.com> To: ffmpeg-devel@ffmpeg.org Date: Fri, 24 Jan 2020 11:46:56 +0100 Message-Id: <20200124104656.26372-1-onemda@gmail.com> Subject: [FFmpeg-devel] [PATCH] avfilter: add xfade opencl filter Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>
Series	[FFmpeg-devel] avfilter: add xfade opencl filter \| expand [FFmpeg-devel] avfilter: add xfade opencl filter

Message ID

20200124104656.26372-1-onemda@gmail.com

State

Superseded

Headers

From: Paul B Mahol <onemda@gmail.com>
To: ffmpeg-devel@ffmpeg.org
Date: Fri, 24 Jan 2020 11:46:56 +0100
Message-Id: <20200124104656.26372-1-onemda@gmail.com>
Subject: [FFmpeg-devel] [PATCH] avfilter: add xfade opencl filter
Precedence: list
Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: base64
Errors-To: ffmpeg-devel-bounces@ffmpeg.org
Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

Series

[FFmpeg-devel] avfilter: add xfade opencl filter | expand

Context	Check	Description
andriy/ffmpeg-patchwork	success	Make fate finished

Context

Check

Description

andriy/ffmpeg-patchwork

success

Make fate finished

Commit Message

Paul B Mahol Jan. 24, 2020, 10:46 a.m. UTC

Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 configure                     |   1 +
 doc/filters.texi              |  97 ++++++++
 libavfilter/Makefile          |   1 +
 libavfilter/allfilters.c      |   1 +
 libavfilter/opencl/xfade.cl   | 150 ++++++++++++
 libavfilter/opencl_source.h   |   1 +
 libavfilter/vf_xfade_opencl.c | 427 ++++++++++++++++++++++++++++++++++
 7 files changed, 678 insertions(+)
 create mode 100644 libavfilter/opencl/xfade.cl
 create mode 100644 libavfilter/vf_xfade_opencl.c

Comments

Mark Thompson Jan. 26, 2020, 3:33 p.m. UTC | #1

On 24/01/2020 10:46, Paul B Mahol wrote:
> Signed-off-by: Paul B Mahol <onemda@gmail.com>
> ---
>  configure                     |   1 +
>  doc/filters.texi              |  97 ++++++++
>  libavfilter/Makefile          |   1 +
>  libavfilter/allfilters.c      |   1 +
>  libavfilter/opencl/xfade.cl   | 150 ++++++++++++
>  libavfilter/opencl_source.h   |   1 +
>  libavfilter/vf_xfade_opencl.c | 427 ++++++++++++++++++++++++++++++++++
>  7 files changed, 678 insertions(+)
>  create mode 100644 libavfilter/opencl/xfade.cl
>  create mode 100644 libavfilter/vf_xfade_opencl.c
> 
> diff --git a/configure b/configure
> index 1f3d0fdd4b..7da9f486b8 100755
> --- a/configure
> +++ b/configure
> @@ -3595,6 +3595,7 @@ zscale_filter_deps="libzimg const_nan"
>  scale_vaapi_filter_deps="vaapi"
>  vpp_qsv_filter_deps="libmfx"
>  vpp_qsv_filter_select="qsvvpp"
> +xfade_opencl_filter_deps="opencl"
>  yadif_cuda_filter_deps="ffnvcodec"
>  yadif_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
>  
> diff --git a/doc/filters.texi b/doc/filters.texi
> index a9ae75f0c0..1b787c51d7 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -21341,6 +21341,103 @@ Apply a strong blur of both luma and chroma parameters:
>  @end example
>  @end itemize
>  
> +@section xfade_opencl
> +
> +Cross fade two videos with custom transition effect by using OpenCL.
> +
> +It accepts the following options:
> +
> +@table @option
> +@item transition
> +Set one of possible transition effects.
> +
> +@table @option
> +@item custom
> +Select custom transition effect, the actual transition description
> +will be picked from source and kernel options.
> +
> +@item fade
> +@item wipeleft
> +@item wiperight
> +@item wipeup
> +@item wipedown
> +@item slideleft
> +@item slideright
> +@item slideup
> +@item slidedown
> +
> +Default transtition is fade.
> +@end table
> +
> +@item source
> +OpenCL program source file for custom transition.
> +
> +@item kernel
> +Set name of kernel to use for custom transition from program source file.
> +
> +@item duration
> +Set duration of video transition.
> +
> +@item offset
> +Set time of start of transition relative to first video.
> +@end table
> +
> +The program source file must contain a kernel function with the given name,
> +which will be run once for each plane of the output.  Each run on a plane
> +gets enqueued as a separate 2D global NDRange with one work-item for each
> +pixel to be generated.  The global ID offset for each work-item is therefore
> +the coordinates of a pixel in the destination image.
> +
> +The kernel function needs to take the following arguments:
> +@itemize
> +@item
> +Destination image, @var{__write_only image2d_t}.
> +
> +This image will become the output; the kernel should write all of it.
> +
> +@item
> +First Source image, @var{__read_only image2d_t}.
> +Second Source image, @var{__read_only image2d_t}.
> +
> +These are the most recent images on each input.  The kernel may read from
> +them to generate the output, but they can't be written to.
> +
> +@item
> +Transition progress, @var{float}. This value is always between 0 and 1 inclusive.
> +@end itemize
> +
> +Example programs:
> +
> +@itemize
> +@item
> +Apply dots curtain transition effect:
> +@verbatim
> +__kernel void blend_images(__write_only image2d_t dst,
> +                           __read_only  image2d_t src1,
> +                           __read_only  image2d_t src2,
> +                           float progress)
> +{
> +    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
> +                               CLK_FILTER_LINEAR);
> +    int2  p = (int2)(get_global_id(0), get_global_id(1));
> +    float2 rp = (float2)(get_global_id(0), get_global_id(1));
> +    float2 dim = (float2)(get_image_dim(src1).x, get_image_dim(src1).y);
> +    rp = rp / dim;
> +
> +    float2 dots = (float2)(20.0, 20.0);
> +    float2 center = (float2)(0,0);
> +    float2 unused;
> +
> +    float4 val1 = read_imagef(src1, sampler, p);
> +    float4 val2 = read_imagef(src2, sampler, p);
> +    bool next = distance(fract(rp * dots, &unused), (float2)(0.5, 0.5)) < (progress / distance(rp, center));
> +
> +    write_imagef(dst, p, next ? val1 : val2);> +}

Nice example :)

> +@end verbatim
> +
> +@end itemize
> +
>  @c man end OPENCL VIDEO FILTERS
>  
>  @chapter VAAPI Video Filters
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index 58b3077dec..a5ee9c8e88 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -441,6 +441,7 @@ OBJS-$(CONFIG_W3FDIF_FILTER)                 += vf_w3fdif.o
>  OBJS-$(CONFIG_WAVEFORM_FILTER)               += vf_waveform.o
>  OBJS-$(CONFIG_WEAVE_FILTER)                  += vf_weave.o
>  OBJS-$(CONFIG_XBR_FILTER)                    += vf_xbr.o
> +OBJS-$(CONFIG_XFADE_OPENCL_FILTER)           += vf_xfade_opencl.o opencl.o opencl/xfade.o
>  OBJS-$(CONFIG_XMEDIAN_FILTER)                += vf_xmedian.o framesync.o
>  OBJS-$(CONFIG_XSTACK_FILTER)                 += vf_stack.o framesync.o
>  OBJS-$(CONFIG_YADIF_FILTER)                  += vf_yadif.o yadif_common.o
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index 6270c18ae2..8a7eac3757 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -420,6 +420,7 @@ extern AVFilter ff_vf_w3fdif;
>  extern AVFilter ff_vf_waveform;
>  extern AVFilter ff_vf_weave;
>  extern AVFilter ff_vf_xbr;
> +extern AVFilter ff_vf_xfade_opencl;
>  extern AVFilter ff_vf_xmedian;
>  extern AVFilter ff_vf_xstack;
>  extern AVFilter ff_vf_yadif;
> diff --git a/libavfilter/opencl/xfade.cl b/libavfilter/opencl/xfade.cl
> new file mode 100644
> index 0000000000..9b5bdb5e29
> --- /dev/null
> +++ b/libavfilter/opencl/xfade.cl
> @@ -0,0 +1,150 @@
> +__kernel void fade(__write_only image2d_t dst,
> +                   __read_only  image2d_t src1,
> +                   __read_only  image2d_t src2,
> +                   float progress)
> +{
> +    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
> +                               CLK_FILTER_NEAREST);
> +    int2  p = (int2)(get_global_id(0), get_global_id(1));
> +
> +    float4 val1 = read_imagef(src1, sampler, p);
> +    float4 val2 = read_imagef(src2, sampler, p);
> +
> +    write_imagef(dst, p, val1 * progress + val2 * (1.f - progress));
> +}
> +
> +__kernel void wipeleft(__write_only image2d_t dst,
> +                       __read_only  image2d_t src1,
> +                       __read_only  image2d_t src2,
> +                       float progress)
> +{
> +    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
> +                               CLK_FILTER_NEAREST);
> +    int2  p = (int2)(get_global_id(0), get_global_id(1));
> +    int   s = (int)(get_image_dim(src1).x * progress);
> +
> +    float4 val1 = read_imagef(src1, sampler, p);
> +    float4 val2 = read_imagef(src2, sampler, p);
> +
> +    write_imagef(dst, p, p.x > s ? val2 : val1);

For formats with subsampling, the selection of a pixel from one image or the other has slightly unfortunate effects on the boundary.  If the boundary happens in the middle of a 2x2 block in yuv420p, then the luma pixels will be right but the chroma will match only one side of the boundary.  That's visible in some transitions like this one as the boundary line changing between two colours as it moves depending on whether it happens to land on an odd or an even pixel.

Is there anything you can do about that?  (Possibly averaging chroma values from the two sides if you land in the middle of a subsampled pixel could work.)

This applies to all the wipe transitions, and is also visible as funny colours on the edges of the dots in the custom example.

> +}
> +
> +__kernel void wiperight(__write_only image2d_t dst,
> +                        __read_only  image2d_t src1,
> +                        __read_only  image2d_t src2,
> +                        float progress)
> +{
> +    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
> +                               CLK_FILTER_NEAREST);
> +    int2  p = (int2)(get_global_id(0), get_global_id(1));
> +    int   s = (int)(get_image_dim(src1).x * (1.f - progress));
> +
> +    float4 val1 = read_imagef(src1, sampler, p);
> +    float4 val2 = read_imagef(src2, sampler, p);
> +
> +    write_imagef(dst, p, p.x > s ? val1 : val2);
> +}
> +
> +__kernel void wipeup(__write_only image2d_t dst,
> +                     __read_only  image2d_t src1,
> +                     __read_only  image2d_t src2,
> +                     float progress)
> +{
> +    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
> +                               CLK_FILTER_NEAREST);
> +    int2  p = (int2)(get_global_id(0), get_global_id(1));
> +    int   s = (int)(get_image_dim(src1).y * progress);
> +
> +    float4 val1 = read_imagef(src1, sampler, p);
> +    float4 val2 = read_imagef(src2, sampler, p);
> +
> +    write_imagef(dst, p, p.y > s ? val2 : val1);
> +}
> +
> +__kernel void wipedown(__write_only image2d_t dst,
> +                       __read_only  image2d_t src1,
> +                       __read_only  image2d_t src2,
> +                       float progress)
> +{
> +    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
> +                               CLK_FILTER_NEAREST);
> +    int2  p = (int2)(get_global_id(0), get_global_id(1));
> +    int   s = (int)(get_image_dim(src1).y * (1.f - progress));
> +
> +    float4 val1 = read_imagef(src1, sampler, p);
> +    float4 val2 = read_imagef(src2, sampler, p);
> +
> +    write_imagef(dst, p, p.y > s ? val1 : val2);
> +}

Unless I've missed something, the only difference in each of these is the choice of x/y and </> in the comparison at the end.  Can you macro that to avoid quadruplicating everything else?

> +
> +__kernel void slidedown(__write_only image2d_t dst,
> +                        __read_only  image2d_t src1,
> +                        __read_only  image2d_t src2,
> +                        float progress)
> +{
> +    const sampler_t sampler = (CLK_NORMALIZED_COORDS_TRUE |
> +                               CLK_FILTER_LINEAR);
> +    float2 direction = (float2)(0.0, 1.0);

Surprise double-precision constants.

> +    float2 uv = (float2)(get_global_id(0), get_global_id(1)) / (float2)(get_image_dim(src1).x, get_image_dim(src1).y);
> +    int2 uvi = (int2)(get_global_id(0), get_global_id(1));
> +    float2 p = uv + progress * sign(direction);
> +    float2 unused;
> +    float2 f = fract(p, &unused);
> +    float4 val1 = read_imagef(src1, sampler, f);
> +    float4 val2 = read_imagef(src2, sampler, f);

These go out of range and read values off the bottom of the image - normalised coordinates for images are in the range [0.0 .. (height-1)/height], not [0.0, 1.0].

It was visible as black/green on the boundary in my testing (uninitialised data happening to be zero, I guess).

> +    write_imagef(dst, uvi, mix(val1, val2, step(0.f, p.y) * step(p.y, 1.f) * step(0.f, p.x) * step(p.x, 1.f)));
> +}> +
> +__kernel void slideup(__write_only image2d_t dst,
> +                      __read_only  image2d_t src1,
> +                      __read_only  image2d_t src2,
> +                      float progress)
> +{
> +    const sampler_t sampler = (CLK_NORMALIZED_COORDS_TRUE |
> +                               CLK_FILTER_LINEAR);
> +    float2 direction = (float2)(0.0, -1.0);
> +    float2 uv = (float2)(get_global_id(0), get_global_id(1)) / (float2)(get_image_dim(src1).x, get_image_dim(src1).y);
> +    int2 uvi = (int2)(get_global_id(0), get_global_id(1));
> +    float2 p = uv + progress * sign(direction);
> +    float2 unused;
> +    float2 f = fract(p, &unused);
> +    float4 val1 = read_imagef(src1, sampler, f);
> +    float4 val2 = read_imagef(src2, sampler, f);
> +    write_imagef(dst, uvi, mix(val1, val2, step(0.f, p.y) * step(p.y, 1.f) * step(0.f, p.x) * step(p.x, 1.f)));
> +}
> +
> +__kernel void slideleft(__write_only image2d_t dst,
> +                        __read_only  image2d_t src1,
> +                        __read_only  image2d_t src2,
> +                        float progress)
> +{
> +    const sampler_t sampler = (CLK_NORMALIZED_COORDS_TRUE |
> +                               CLK_FILTER_LINEAR);
> +    float2 direction = (float2)(-1.0, 0.0);
> +    float2 uv = (float2)(get_global_id(0), get_global_id(1)) / (float2)(get_image_dim(src1).x, get_image_dim(src1).y);
> +    int2 uvi = (int2)(get_global_id(0), get_global_id(1));
> +    float2 p = uv + progress * sign(direction);
> +    float2 unused;
> +    float2 f = fract(p, &unused);
> +    float4 val1 = read_imagef(src1, sampler, f);
> +    float4 val2 = read_imagef(src2, sampler, f);
> +    write_imagef(dst, uvi, mix(val1, val2, step(0.f, p.y) * step(p.y, 1.f) * step(0.f, p.x) * step(p.x, 1.f)));
> +}
> +
> +__kernel void slideright(__write_only image2d_t dst,
> +                         __read_only  image2d_t src1,
> +                         __read_only  image2d_t src2,
> +                         float progress)
> +{
> +    const sampler_t sampler = (CLK_NORMALIZED_COORDS_TRUE |
> +                               CLK_FILTER_LINEAR);
> +    float2 direction = (float2)(1.0, 0.0);
> +    float2 uv = (float2)(get_global_id(0), get_global_id(1)) / (float2)(get_image_dim(src1).x, get_image_dim(src1).y);
> +    int2 uvi = (int2)(get_global_id(0), get_global_id(1));
> +    float2 p = uv + progress * sign(direction);
> +    float2 unused;
> +    float2 f = fract(p, &unused);
> +    float4 val1 = read_imagef(src1, sampler, f);
> +    float4 val2 = read_imagef(src2, sampler, f);
> +    write_imagef(dst, uvi, mix(val1, val2, step(0.f, p.y) * step(p.y, 1.f) * step(0.f, p.x) * step(p.x, 1.f)));
> +}

Same comment about the kernels all being mostly the same - just the direction value?

> diff --git a/libavfilter/opencl_source.h b/libavfilter/opencl_source.h
> index 225e7a49ea..4e262672ad 100644
> --- a/libavfilter/opencl_source.h
> +++ b/libavfilter/opencl_source.h
> @@ -30,5 +30,6 @@ extern const char *ff_opencl_source_overlay;
>  extern const char *ff_opencl_source_tonemap;
>  extern const char *ff_opencl_source_transpose;
>  extern const char *ff_opencl_source_unsharp;
> +extern const char *ff_opencl_source_xfade;
>  
>  #endif /* AVFILTER_OPENCL_SOURCE_H */
> diff --git a/libavfilter/vf_xfade_opencl.c b/libavfilter/vf_xfade_opencl.c
> new file mode 100644
> index 0000000000..83a45ec4bd
> --- /dev/null
> +++ b/libavfilter/vf_xfade_opencl.c
> @@ -0,0 +1,427 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/log.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/pixdesc.h"
> +
> +#include "avfilter.h"
> +#include "filters.h"
> +#include "framesync.h"
> +#include "internal.h"
> +#include "opencl.h"
> +#include "opencl_source.h"
> +#include "video.h"
> +
> +enum XFadeTransitions {
> +    CUSTOM,
> +    FADE,
> +    WIPELEFT,
> +    WIPERIGHT,
> +    WIPEUP,
> +    WIPEDOWN,
> +    SLIDELEFT,
> +    SLIDERIGHT,
> +    SLIDEUP,
> +    SLIDEDOWN,
> +    NB_TRANSITIONS,
> +};
> +
> +typedef struct XFadeOpenCLContext {
> +    OpenCLFilterContext ocf;
> +
> +    int              transition;
> +    const char      *source_file;
> +    const char      *kernel_name;
> +    int64_t          duration;
> +    int64_t          offset;
> +
> +    int              initialised;
> +    cl_kernel        kernel;
> +    cl_command_queue command_queue;
> +
> +    int              nb_planes;
> +
> +    int64_t          duration_pts;
> +    int64_t          offset_pts;
> +    int64_t          first_pts;
> +    int64_t          pts;
> +    int              xfade_is_over;
> +    int              need_second;
> +    int              eof[2];
> +    AVFrame         *xf[2];
> +} XFadeOpenCLContext;
> +
> +static int xfade_opencl_load(AVFilterContext *avctx,
> +                             enum AVPixelFormat main_format,
> +                             enum AVPixelFormat xfade_format)
> +{
> +    XFadeOpenCLContext *ctx = avctx->priv;
> +    cl_int cle;
> +    const AVPixFmtDescriptor *main_desc, *xfade_desc;
> +    int err, i, main_planes, xfade_planes;
> +    const char *kernel_name;
> +
> +    ctx->ocf.output_width  = avctx->inputs[0]->w;
> +    ctx->ocf.output_height = avctx->inputs[0]->h;

These will already have been set by config_input(), which takes exactly these as the default.

> +    ctx->ocf.output_format = avctx->inputs[0]->format;

Will be AV_PIX_FMT_OPENCL which isn't what you want (but it isn't used after this point anyway).

> +
> +    main_desc  = av_pix_fmt_desc_get(main_format);
> +    xfade_desc = av_pix_fmt_desc_get(xfade_format);

For non-custom transitions the formats do have to be identical.  It seems worth checking that.

> +
> +    main_planes = xfade_planes = 0;
> +    for (i = 0; i < main_desc->nb_components; i++)
> +        main_planes = FFMAX(main_planes,
> +                            main_desc->comp[i].plane + 1);
> +    for (i = 0; i < xfade_desc->nb_components; i++)
> +        xfade_planes = FFMAX(xfade_planes,
> +                             xfade_desc->comp[i].plane + 1);
> +
> +    ctx->nb_planes = main_planes;

Bad things will happen if xfade_planes != main_planes.

> +
> +    if (ctx->transition == CUSTOM) {
> +        err = ff_opencl_filter_load_program_from_file(avctx, ctx->source_file);
> +        if (err < 0)
> +            return err;
> +    } else {
> +        err = ff_opencl_filter_load_program(avctx, &ff_opencl_source_xfade, 1);
> +        if (err < 0)
> +            goto fail;

The different error cases look odd.  (I think both can be return, since nothing has been made yet.)

> +    }
> +
> +    ctx->command_queue = clCreateCommandQueue(ctx->ocf.hwctx->context,
> +                                              ctx->ocf.hwctx->device_id,
> +                                              0, &cle);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create OpenCL "
> +                     "command queue %d.\n", cle);
> +
> +    switch (ctx->transition) {
> +    case CUSTOM:     kernel_name = ctx->kernel_name; break;
> +    case FADE:       kernel_name = "fade";           break;
> +    case WIPELEFT:   kernel_name = "wipeleft";       break;
> +    case WIPERIGHT:  kernel_name = "wiperight";      break;
> +    case WIPEUP:     kernel_name = "wipeup";         break;
> +    case WIPEDOWN:   kernel_name = "wipedown";       break;
> +    case SLIDELEFT:  kernel_name = "slideleft";      break;
> +    case SLIDERIGHT: kernel_name = "slideright";     break;
> +    case SLIDEUP:    kernel_name = "slideup";        break;
> +    case SLIDEDOWN:  kernel_name = "slidedown";      break;
> +    default:
> +        err = AVERROR_BUG;
> +        goto fail;
> +    }
> +
> +    ctx->kernel = clCreateKernel(ctx->ocf.program, kernel_name, &cle);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create kernel %d.\n", cle);
> +
> +    ctx->initialised = 1;
> +
> +    return 0;
> +
> +fail:
> +    if (ctx->command_queue)
> +        clReleaseCommandQueue(ctx->command_queue);
> +    if (ctx->kernel)
> +        clReleaseKernel(ctx->kernel);
> +    return err;
> +}
> +
> +static int xfade_frame(AVFilterContext *avctx, AVFrame *a, AVFrame *b)
> +{
> +    AVFilterLink *outlink = avctx->outputs[0];
> +    XFadeOpenCLContext *ctx = avctx->priv;
> +    AVFrame *output;
> +    cl_int cle;
> +    cl_float progress = av_clipf(1.f - ((cl_float)(ctx->pts - ctx->first_pts - ctx->offset_pts) / ctx->duration_pts), 0.f, 1.f);
> +    size_t global_work[2];
> +    int kernel_arg = 0;
> +    int err, plane;
> +
> +    if (!ctx->initialised) {
> +        AVHWFramesContext *main_fc =
> +            (AVHWFramesContext*)a->hw_frames_ctx->data;
> +        AVHWFramesContext *xfade_fc =
> +            (AVHWFramesContext*)b->hw_frames_ctx->data;
> +
> +        err = xfade_opencl_load(avctx, main_fc->sw_format,
> +                                xfade_fc->sw_format);
> +        if (err < 0)
> +            return err;
> +    }
> +
> +    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
> +    if (!output) {
> +        err = AVERROR(ENOMEM);
> +        goto fail;
> +    }
> +
> +    for (plane = 0; plane < ctx->nb_planes; plane++) {
> +        cl_mem mem;
> +        kernel_arg = 0;
> +
> +        mem = (cl_mem)output->data[plane];
> +        CL_SET_KERNEL_ARG(ctx->kernel, kernel_arg, cl_mem, &mem);
> +        kernel_arg++;
> +
> +        mem = (cl_mem)ctx->xf[0]->data[plane];
> +        CL_SET_KERNEL_ARG(ctx->kernel, kernel_arg, cl_mem, &mem);
> +        kernel_arg++;
> +
> +        mem = (cl_mem)ctx->xf[1]->data[plane];
> +        CL_SET_KERNEL_ARG(ctx->kernel, kernel_arg, cl_mem, &mem);
> +        kernel_arg++;
> +
> +        CL_SET_KERNEL_ARG(ctx->kernel, kernel_arg, cl_float, &progress);
> +        kernel_arg++;
> +
> +        err = ff_opencl_filter_work_size_from_image(avctx, global_work,
> +                                                    output, plane, 0);
> +        if (err < 0)
> +            goto fail;
> +
> +        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->kernel, 2, NULL,
> +                                     global_work, NULL, 0, NULL, NULL);
> +        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue xfade kernel "
> +                         "for plane %d: %d.\n", plane, cle);
> +    }
> +
> +    cle = clFinish(ctx->command_queue);
> +    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to finish command queue: %d.\n", cle);
> +
> +    err = av_frame_copy_props(output, ctx->xf[0]);
> +    if (err < 0)
> +        goto fail;
> +
> +    output->pts = ctx->pts;
> +
> +    return ff_filter_frame(outlink, output);
> +
> +fail:
> +    av_frame_free(&output);
> +    return err;
> +}
> +
> +static int xfade_opencl_config_output(AVFilterLink *outlink)
> +{
> +    AVFilterContext *avctx = outlink->src;
> +    XFadeOpenCLContext *ctx = avctx->priv;
> +    AVFilterLink *mainlink = avctx->inputs[0];
> +    int err;
> +
> +    err = ff_opencl_filter_config_output(outlink);
> +    if (err < 0)
> +        return err;
> +
> +    ctx->first_pts = ctx->pts = AV_NOPTS_VALUE;
> +
> +    outlink->w = mainlink->w;
> +    outlink->h = mainlink->h;

These are already set by ff_opencl_filter_config_output() using its default behaviour of taking the properties from the first input.

> +    outlink->time_base = mainlink->time_base;
> +    outlink->sample_aspect_ratio = mainlink->sample_aspect_ratio;
> +    outlink->frame_rate = mainlink->frame_rate;
> +
> +    if (ctx->duration)
> +        ctx->duration_pts = av_rescale_q(ctx->duration, AV_TIME_BASE_Q, outlink->time_base);
> +    if (ctx->offset)
> +        ctx->offset_pts = av_rescale_q(ctx->offset, AV_TIME_BASE_Q, outlink->time_base);
> +
> +    return 0;
> +}
> +
> +static int xfade_opencl_activate(AVFilterContext *avctx)
> +{
> +    XFadeOpenCLContext *ctx = avctx->priv;
> +    AVFilterLink *outlink = avctx->outputs[0];
> +    AVFrame *in = NULL;
> +    int ret = 0, status;
> +    int64_t pts;
> +
> +    FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx);
> +
> +    if (ctx->xfade_is_over) {
> +        ret = ff_inlink_consume_frame(avctx->inputs[1], &in);
> +        if (ret < 0) {
> +            return ret;
> +        } else if (ff_inlink_acknowledge_status(avctx->inputs[1], &status, &pts)) {
> +            ff_outlink_set_status(outlink, status, ctx->pts);
> +            return 0;
> +        } else if (!ret) {
> +            if (ff_outlink_frame_wanted(outlink)) {
> +                ff_inlink_request_frame(avctx->inputs[1]);
> +                return 0;
> +            }
> +        } else {
> +            in->pts = ctx->pts;
> +            ctx->pts += av_rescale_q(1, av_inv_q(outlink->frame_rate), outlink->time_base);
> +            return ff_filter_frame(outlink, in);
> +        }
> +    }
> +
> +    if (ff_inlink_queued_frames(avctx->inputs[0]) > 0) {
> +        ctx->xf[0] = ff_inlink_peek_frame(avctx->inputs[0], 0);
> +        if (ctx->xf[0]) {
> +            if (ctx->first_pts == AV_NOPTS_VALUE) {
> +                ctx->first_pts = ctx->xf[0]->pts;
> +            }
> +            ctx->pts = ctx->xf[0]->pts;
> +            if (ctx->first_pts + ctx->offset_pts > ctx->xf[0]->pts) {
> +                ctx->xf[0] = NULL;
> +                ctx->need_second = 0;
> +                ff_inlink_consume_frame(avctx->inputs[0], &in);
> +                return ff_filter_frame(outlink, in);
> +            }
> +
> +            ctx->need_second = 1;
> +        }
> +    }
> +
> +    if (ctx->xf[0] && ff_inlink_queued_frames(avctx->inputs[1]) > 0) {
> +        ff_inlink_consume_frame(avctx->inputs[0], &ctx->xf[0]);
> +        ff_inlink_consume_frame(avctx->inputs[1], &ctx->xf[1]);
> +
> +        ctx->pts = ctx->xf[0]->pts;
> +        if (ctx->xf[0]->pts - (ctx->first_pts + ctx->offset_pts) > ctx->duration_pts)
> +            ctx->xfade_is_over = 1;
> +        ret = xfade_frame(avctx, ctx->xf[0], ctx->xf[1]);
> +        av_frame_free(&ctx->xf[0]);
> +        av_frame_free(&ctx->xf[1]);
> +        return ret;
> +    }
> +
> +    if (ff_inlink_queued_frames(avctx->inputs[0]) > 0 &&
> +        ff_inlink_queued_frames(avctx->inputs[1]) > 0) {
> +        ff_filter_set_ready(avctx, 100);
> +        return 0;
> +    }
> +
> +    if (ff_outlink_frame_wanted(outlink)) {
> +        if (!ctx->eof[0] && ff_outlink_get_status(avctx->inputs[0])) {
> +            ctx->eof[0] = 1;
> +            ctx->xfade_is_over = 1;
> +        }
> +        if (!ctx->eof[1] && ff_outlink_get_status(avctx->inputs[1])) {
> +            ctx->eof[1] = 1;
> +        }
> +        if (!ctx->eof[0] && !ctx->xf[0])
> +            ff_inlink_request_frame(avctx->inputs[0]);
> +        if (!ctx->eof[1] && (ctx->need_second || ctx->eof[0]))
> +            ff_inlink_request_frame(avctx->inputs[1]);
> +        if (ctx->eof[0] && ctx->eof[1] && (
> +            ff_inlink_queued_frames(avctx->inputs[0]) <= 0 ||
> +            ff_inlink_queued_frames(avctx->inputs[1]) <= 0))
> +            ff_outlink_set_status(outlink, AVERROR_EOF, AV_NOPTS_VALUE);
> +        return 0;
> +    }
> +
> +    return FFERROR_NOT_READY;
> +}
> +
> +static av_cold void xfade_opencl_uninit(AVFilterContext *avctx)
> +{
> +    XFadeOpenCLContext *ctx = avctx->priv;
> +    cl_int cle;
> +
> +    if (ctx->kernel) {
> +        cle = clReleaseKernel(ctx->kernel);
> +        if (cle != CL_SUCCESS)
> +            av_log(avctx, AV_LOG_ERROR, "Failed to release "
> +                   "kernel: %d.\n", cle);
> +    }
> +
> +    if (ctx->command_queue) {
> +        cle = clReleaseCommandQueue(ctx->command_queue);
> +        if (cle != CL_SUCCESS)
> +            av_log(avctx, AV_LOG_ERROR, "Failed to release "
> +                   "command queue: %d.\n", cle);
> +    }
> +
> +    ff_opencl_filter_uninit(avctx);
> +}
> +
> +static AVFrame *get_video_buffer(AVFilterLink *inlink, int w, int h)
> +{
> +    XFadeOpenCLContext *s = inlink->dst->priv;
> +
> +    return s->xfade_is_over || !s->need_second ?
> +        ff_null_get_video_buffer   (inlink, w, h) :
> +        ff_default_get_video_buffer(inlink, w, h);
> +}
> +
> +#define OFFSET(x) offsetof(XFadeOpenCLContext, x)
> +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
> +
> +static const AVOption xfade_opencl_options[] = {
> +    { "transition", "set cross fade transition", OFFSET(transition), AV_OPT_TYPE_INT, {.i64=1}, 0, NB_TRANSITIONS-1, FLAGS, "transition" },
> +    {   "custom",    "custom transition",     0, AV_OPT_TYPE_CONST, {.i64=CUSTOM},    0, 0, FLAGS, "transition" },
> +    {   "fade",      "fade transition",       0, AV_OPT_TYPE_CONST, {.i64=FADE},      0, 0, FLAGS, "transition" },
> +    {   "wipeleft",  "wipe left transition",  0, AV_OPT_TYPE_CONST, {.i64=WIPELEFT},  0, 0, FLAGS, "transition" },
> +    {   "wiperight", "wipe right transition", 0, AV_OPT_TYPE_CONST, {.i64=WIPERIGHT}, 0, 0, FLAGS, "transition" },
> +    {   "wipeup",    "wipe up transition",    0, AV_OPT_TYPE_CONST, {.i64=WIPEUP},    0, 0, FLAGS, "transition" },
> +    {   "wipedown",  "wipe down transition",  0, AV_OPT_TYPE_CONST, {.i64=WIPEDOWN},  0, 0, FLAGS, "transition" },
> +    {   "slideleft",  "slide left transition",  0, AV_OPT_TYPE_CONST, {.i64=SLIDELEFT},  0, 0, FLAGS, "transition" },
> +    {   "slideright", "slide right transition", 0, AV_OPT_TYPE_CONST, {.i64=SLIDERIGHT}, 0, 0, FLAGS, "transition" },
> +    {   "slideup",    "slide up transition",    0, AV_OPT_TYPE_CONST, {.i64=SLIDEUP},    0, 0, FLAGS, "transition" },
> +    {   "slidedown",  "slide down transition",  0, AV_OPT_TYPE_CONST, {.i64=SLIDEDOWN},  0, 0, FLAGS, "transition" },
> +    { "source", "set OpenCL program source file for custom transition", OFFSET(source_file), AV_OPT_TYPE_STRING, {.str = NULL}, .flags = FLAGS },
> +    { "kernel", "set kernel name in program file for custom transition", OFFSET(kernel_name), AV_OPT_TYPE_STRING, {.str = NULL}, .flags = FLAGS },
> +    { "duration", "set cross fade duration", OFFSET(duration), AV_OPT_TYPE_DURATION, {.i64=1000000}, 0, 60000000, FLAGS },
> +    { "offset",   "set cross fade start relative to first input stream", OFFSET(offset), AV_OPT_TYPE_DURATION, {.i64=0}, 0, 60000000, FLAGS },
> +    { NULL }
> +};
> +
> +AVFILTER_DEFINE_CLASS(xfade_opencl);
> +
> +static const AVFilterPad xfade_opencl_inputs[] = {
> +    {
> +        .name             = "main",
> +        .type             = AVMEDIA_TYPE_VIDEO,
> +        .get_video_buffer = get_video_buffer,
> +        .config_props     = &ff_opencl_filter_config_input,
> +    },
> +    {
> +        .name             = "xfade",
> +        .type             = AVMEDIA_TYPE_VIDEO,
> +        .get_video_buffer = get_video_buffer,
> +        .config_props     = &ff_opencl_filter_config_input,
> +    },
> +    { NULL }
> +};
> +
> +static const AVFilterPad xfade_opencl_outputs[] = {
> +    {
> +        .name          = "default",
> +        .type          = AVMEDIA_TYPE_VIDEO,
> +        .config_props  = &xfade_opencl_config_output,
> +    },
> +    { NULL }
> +};
> +
> +AVFilter ff_vf_xfade_opencl = {
> +    .name            = "xfade_opencl",
> +    .description     = NULL_IF_CONFIG_SMALL("Cross fade one video with another video."),
> +    .priv_size       = sizeof(XFadeOpenCLContext),
> +    .priv_class      = &xfade_opencl_class,
> +    .init            = &ff_opencl_filter_init,
> +    .uninit          = &xfade_opencl_uninit,
> +    .query_formats   = &ff_opencl_filter_query_formats,
> +    .activate        = &xfade_opencl_activate,
> +    .inputs          = xfade_opencl_inputs,
> +    .outputs         = xfade_opencl_outputs,
> +    .flags_internal  = FF_FILTER_FLAG_HWFRAME_AWARE,
> +};
> 

Thanks,

- Mark

diff --git a/configure b/configure
index 1f3d0fdd4b..7da9f486b8 100755
--- a/configure
+++ b/configure
@@ -3595,6 +3595,7 @@  zscale_filter_deps="libzimg const_nan"
 scale_vaapi_filter_deps="vaapi"
 vpp_qsv_filter_deps="libmfx"
 vpp_qsv_filter_select="qsvvpp"
+xfade_opencl_filter_deps="opencl"
 yadif_cuda_filter_deps="ffnvcodec"
 yadif_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 
diff --git a/doc/filters.texi b/doc/filters.texi
index a9ae75f0c0..1b787c51d7 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -21341,6 +21341,103 @@  Apply a strong blur of both luma and chroma parameters:
 @end example
 @end itemize
 
+@section xfade_opencl
+
+Cross fade two videos with custom transition effect by using OpenCL.
+
+It accepts the following options:
+
+@table @option
+@item transition
+Set one of possible transition effects.
+
+@table @option
+@item custom
+Select custom transition effect, the actual transition description
+will be picked from source and kernel options.
+
+@item fade
+@item wipeleft
+@item wiperight
+@item wipeup
+@item wipedown
+@item slideleft
+@item slideright
+@item slideup
+@item slidedown
+
+Default transtition is fade.
+@end table
+
+@item source
+OpenCL program source file for custom transition.
+
+@item kernel
+Set name of kernel to use for custom transition from program source file.
+
+@item duration
+Set duration of video transition.
+
+@item offset
+Set time of start of transition relative to first video.
+@end table
+
+The program source file must contain a kernel function with the given name,
+which will be run once for each plane of the output.  Each run on a plane
+gets enqueued as a separate 2D global NDRange with one work-item for each
+pixel to be generated.  The global ID offset for each work-item is therefore
+the coordinates of a pixel in the destination image.
+
+The kernel function needs to take the following arguments:
+@itemize
+@item
+Destination image, @var{__write_only image2d_t}.
+
+This image will become the output; the kernel should write all of it.
+
+@item
+First Source image, @var{__read_only image2d_t}.
+Second Source image, @var{__read_only image2d_t}.
+
+These are the most recent images on each input.  The kernel may read from
+them to generate the output, but they can't be written to.
+
+@item
+Transition progress, @var{float}. This value is always between 0 and 1 inclusive.
+@end itemize
+
+Example programs:
+
+@itemize
+@item
+Apply dots curtain transition effect:
+@verbatim
+__kernel void blend_images(__write_only image2d_t dst,
+                           __read_only  image2d_t src1,
+                           __read_only  image2d_t src2,
+                           float progress)
+{
+    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
+                               CLK_FILTER_LINEAR);
+    int2  p = (int2)(get_global_id(0), get_global_id(1));
+    float2 rp = (float2)(get_global_id(0), get_global_id(1));
+    float2 dim = (float2)(get_image_dim(src1).x, get_image_dim(src1).y);
+    rp = rp / dim;
+
+    float2 dots = (float2)(20.0, 20.0);
+    float2 center = (float2)(0,0);
+    float2 unused;
+
+    float4 val1 = read_imagef(src1, sampler, p);
+    float4 val2 = read_imagef(src2, sampler, p);
+    bool next = distance(fract(rp * dots, &unused), (float2)(0.5, 0.5)) < (progress / distance(rp, center));
+
+    write_imagef(dst, p, next ? val1 : val2);
+}
+@end verbatim
+
+@end itemize
+
 @c man end OPENCL VIDEO FILTERS
 
 @chapter VAAPI Video Filters
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 58b3077dec..a5ee9c8e88 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -441,6 +441,7 @@  OBJS-$(CONFIG_W3FDIF_FILTER)                 += vf_w3fdif.o
 OBJS-$(CONFIG_WAVEFORM_FILTER)               += vf_waveform.o
 OBJS-$(CONFIG_WEAVE_FILTER)                  += vf_weave.o
 OBJS-$(CONFIG_XBR_FILTER)                    += vf_xbr.o
+OBJS-$(CONFIG_XFADE_OPENCL_FILTER)           += vf_xfade_opencl.o opencl.o opencl/xfade.o
 OBJS-$(CONFIG_XMEDIAN_FILTER)                += vf_xmedian.o framesync.o
 OBJS-$(CONFIG_XSTACK_FILTER)                 += vf_stack.o framesync.o
 OBJS-$(CONFIG_YADIF_FILTER)                  += vf_yadif.o yadif_common.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 6270c18ae2..8a7eac3757 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -420,6 +420,7 @@  extern AVFilter ff_vf_w3fdif;
 extern AVFilter ff_vf_waveform;
 extern AVFilter ff_vf_weave;
 extern AVFilter ff_vf_xbr;
+extern AVFilter ff_vf_xfade_opencl;
 extern AVFilter ff_vf_xmedian;
 extern AVFilter ff_vf_xstack;
 extern AVFilter ff_vf_yadif;
diff --git a/libavfilter/opencl/xfade.cl b/libavfilter/opencl/xfade.cl
new file mode 100644
index 0000000000..9b5bdb5e29
--- /dev/null
+++ b/libavfilter/opencl/xfade.cl
@@ -0,0 +1,150 @@ 
+__kernel void fade(__write_only image2d_t dst,
+                   __read_only  image2d_t src1,
+                   __read_only  image2d_t src2,
+                   float progress)
+{
+    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
+                               CLK_FILTER_NEAREST);
+    int2  p = (int2)(get_global_id(0), get_global_id(1));
+
+    float4 val1 = read_imagef(src1, sampler, p);
+    float4 val2 = read_imagef(src2, sampler, p);
+
+    write_imagef(dst, p, val1 * progress + val2 * (1.f - progress));
+}
+
+__kernel void wipeleft(__write_only image2d_t dst,
+                       __read_only  image2d_t src1,
+                       __read_only  image2d_t src2,
+                       float progress)
+{
+    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
+                               CLK_FILTER_NEAREST);
+    int2  p = (int2)(get_global_id(0), get_global_id(1));
+    int   s = (int)(get_image_dim(src1).x * progress);
+
+    float4 val1 = read_imagef(src1, sampler, p);
+    float4 val2 = read_imagef(src2, sampler, p);
+
+    write_imagef(dst, p, p.x > s ? val2 : val1);
+}
+
+__kernel void wiperight(__write_only image2d_t dst,
+                        __read_only  image2d_t src1,
+                        __read_only  image2d_t src2,
+                        float progress)
+{
+    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
+                               CLK_FILTER_NEAREST);
+    int2  p = (int2)(get_global_id(0), get_global_id(1));
+    int   s = (int)(get_image_dim(src1).x * (1.f - progress));
+
+    float4 val1 = read_imagef(src1, sampler, p);
+    float4 val2 = read_imagef(src2, sampler, p);
+
+    write_imagef(dst, p, p.x > s ? val1 : val2);
+}
+
+__kernel void wipeup(__write_only image2d_t dst,
+                     __read_only  image2d_t src1,
+                     __read_only  image2d_t src2,
+                     float progress)
+{
+    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
+                               CLK_FILTER_NEAREST);
+    int2  p = (int2)(get_global_id(0), get_global_id(1));
+    int   s = (int)(get_image_dim(src1).y * progress);
+
+    float4 val1 = read_imagef(src1, sampler, p);
+    float4 val2 = read_imagef(src2, sampler, p);
+
+    write_imagef(dst, p, p.y > s ? val2 : val1);
+}
+
+__kernel void wipedown(__write_only image2d_t dst,
+                       __read_only  image2d_t src1,
+                       __read_only  image2d_t src2,
+                       float progress)
+{
+    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
+                               CLK_FILTER_NEAREST);
+    int2  p = (int2)(get_global_id(0), get_global_id(1));
+    int   s = (int)(get_image_dim(src1).y * (1.f - progress));
+
+    float4 val1 = read_imagef(src1, sampler, p);
+    float4 val2 = read_imagef(src2, sampler, p);
+
+    write_imagef(dst, p, p.y > s ? val1 : val2);
+}
+
+__kernel void slidedown(__write_only image2d_t dst,
+                        __read_only  image2d_t src1,
+                        __read_only  image2d_t src2,
+                        float progress)
+{
+    const sampler_t sampler = (CLK_NORMALIZED_COORDS_TRUE |
+                               CLK_FILTER_LINEAR);
+    float2 direction = (float2)(0.0, 1.0);
+    float2 uv = (float2)(get_global_id(0), get_global_id(1)) / (float2)(get_image_dim(src1).x, get_image_dim(src1).y);
+    int2 uvi = (int2)(get_global_id(0), get_global_id(1));
+    float2 p = uv + progress * sign(direction);
+    float2 unused;
+    float2 f = fract(p, &unused);
+    float4 val1 = read_imagef(src1, sampler, f);
+    float4 val2 = read_imagef(src2, sampler, f);
+    write_imagef(dst, uvi, mix(val1, val2, step(0.f, p.y) * step(p.y, 1.f) * step(0.f, p.x) * step(p.x, 1.f)));
+}
+
+__kernel void slideup(__write_only image2d_t dst,
+                      __read_only  image2d_t src1,
+                      __read_only  image2d_t src2,
+                      float progress)
+{
+    const sampler_t sampler = (CLK_NORMALIZED_COORDS_TRUE |
+                               CLK_FILTER_LINEAR);
+    float2 direction = (float2)(0.0, -1.0);
+    float2 uv = (float2)(get_global_id(0), get_global_id(1)) / (float2)(get_image_dim(src1).x, get_image_dim(src1).y);
+    int2 uvi = (int2)(get_global_id(0), get_global_id(1));
+    float2 p = uv + progress * sign(direction);
+    float2 unused;
+    float2 f = fract(p, &unused);
+    float4 val1 = read_imagef(src1, sampler, f);
+    float4 val2 = read_imagef(src2, sampler, f);
+    write_imagef(dst, uvi, mix(val1, val2, step(0.f, p.y) * step(p.y, 1.f) * step(0.f, p.x) * step(p.x, 1.f)));
+}
+
+__kernel void slideleft(__write_only image2d_t dst,
+                        __read_only  image2d_t src1,
+                        __read_only  image2d_t src2,
+                        float progress)
+{
+    const sampler_t sampler = (CLK_NORMALIZED_COORDS_TRUE |
+                               CLK_FILTER_LINEAR);
+    float2 direction = (float2)(-1.0, 0.0);
+    float2 uv = (float2)(get_global_id(0), get_global_id(1)) / (float2)(get_image_dim(src1).x, get_image_dim(src1).y);
+    int2 uvi = (int2)(get_global_id(0), get_global_id(1));
+    float2 p = uv + progress * sign(direction);
+    float2 unused;
+    float2 f = fract(p, &unused);
+    float4 val1 = read_imagef(src1, sampler, f);
+    float4 val2 = read_imagef(src2, sampler, f);
+    write_imagef(dst, uvi, mix(val1, val2, step(0.f, p.y) * step(p.y, 1.f) * step(0.f, p.x) * step(p.x, 1.f)));
+}
+
+__kernel void slideright(__write_only image2d_t dst,
+                         __read_only  image2d_t src1,
+                         __read_only  image2d_t src2,
+                         float progress)
+{
+    const sampler_t sampler = (CLK_NORMALIZED_COORDS_TRUE |
+                               CLK_FILTER_LINEAR);
+    float2 direction = (float2)(1.0, 0.0);
+    float2 uv = (float2)(get_global_id(0), get_global_id(1)) / (float2)(get_image_dim(src1).x, get_image_dim(src1).y);
+    int2 uvi = (int2)(get_global_id(0), get_global_id(1));
+    float2 p = uv + progress * sign(direction);
+    float2 unused;
+    float2 f = fract(p, &unused);
+    float4 val1 = read_imagef(src1, sampler, f);
+    float4 val2 = read_imagef(src2, sampler, f);
+    write_imagef(dst, uvi, mix(val1, val2, step(0.f, p.y) * step(p.y, 1.f) * step(0.f, p.x) * step(p.x, 1.f)));
+}
diff --git a/libavfilter/opencl_source.h b/libavfilter/opencl_source.h
index 225e7a49ea..4e262672ad 100644
--- a/libavfilter/opencl_source.h
+++ b/libavfilter/opencl_source.h
@@ -30,5 +30,6 @@  extern const char *ff_opencl_source_overlay;
 extern const char *ff_opencl_source_tonemap;
 extern const char *ff_opencl_source_transpose;
 extern const char *ff_opencl_source_unsharp;
+extern const char *ff_opencl_source_xfade;
 
 #endif /* AVFILTER_OPENCL_SOURCE_H */
diff --git a/libavfilter/vf_xfade_opencl.c b/libavfilter/vf_xfade_opencl.c
new file mode 100644
index 0000000000..83a45ec4bd
--- /dev/null
+++ b/libavfilter/vf_xfade_opencl.c
@@ -0,0 +1,427 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/log.h"
+#include "libavutil/mem.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "avfilter.h"
+#include "filters.h"
+#include "framesync.h"
+#include "internal.h"
+#include "opencl.h"
+#include "opencl_source.h"
+#include "video.h"
+
+enum XFadeTransitions {
+    CUSTOM,
+    FADE,
+    WIPELEFT,
+    WIPERIGHT,
+    WIPEUP,
+    WIPEDOWN,
+    SLIDELEFT,
+    SLIDERIGHT,
+    SLIDEUP,
+    SLIDEDOWN,
+    NB_TRANSITIONS,
+};
+
+typedef struct XFadeOpenCLContext {
+    OpenCLFilterContext ocf;
+
+    int              transition;
+    const char      *source_file;
+    const char      *kernel_name;
+    int64_t          duration;
+    int64_t          offset;
+
+    int              initialised;
+    cl_kernel        kernel;
+    cl_command_queue command_queue;
+
+    int              nb_planes;
+
+    int64_t          duration_pts;
+    int64_t          offset_pts;
+    int64_t          first_pts;
+    int64_t          pts;
+    int              xfade_is_over;
+    int              need_second;
+    int              eof[2];
+    AVFrame         *xf[2];
+} XFadeOpenCLContext;
+
+static int xfade_opencl_load(AVFilterContext *avctx,
+                             enum AVPixelFormat main_format,
+                             enum AVPixelFormat xfade_format)
+{
+    XFadeOpenCLContext *ctx = avctx->priv;
+    cl_int cle;
+    const AVPixFmtDescriptor *main_desc, *xfade_desc;
+    int err, i, main_planes, xfade_planes;
+    const char *kernel_name;
+
+    ctx->ocf.output_width  = avctx->inputs[0]->w;
+    ctx->ocf.output_height = avctx->inputs[0]->h;
+    ctx->ocf.output_format = avctx->inputs[0]->format;
+
+    main_desc  = av_pix_fmt_desc_get(main_format);
+    xfade_desc = av_pix_fmt_desc_get(xfade_format);
+
+    main_planes = xfade_planes = 0;
+    for (i = 0; i < main_desc->nb_components; i++)
+        main_planes = FFMAX(main_planes,
+                            main_desc->comp[i].plane + 1);
+    for (i = 0; i < xfade_desc->nb_components; i++)
+        xfade_planes = FFMAX(xfade_planes,
+                             xfade_desc->comp[i].plane + 1);
+
+    ctx->nb_planes = main_planes;
+
+    if (ctx->transition == CUSTOM) {
+        err = ff_opencl_filter_load_program_from_file(avctx, ctx->source_file);
+        if (err < 0)
+            return err;
+    } else {
+        err = ff_opencl_filter_load_program(avctx, &ff_opencl_source_xfade, 1);
+        if (err < 0)
+            goto fail;
+    }
+
+    ctx->command_queue = clCreateCommandQueue(ctx->ocf.hwctx->context,
+                                              ctx->ocf.hwctx->device_id,
+                                              0, &cle);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create OpenCL "
+                     "command queue %d.\n", cle);
+
+    switch (ctx->transition) {
+    case CUSTOM:     kernel_name = ctx->kernel_name; break;
+    case FADE:       kernel_name = "fade";           break;
+    case WIPELEFT:   kernel_name = "wipeleft";       break;
+    case WIPERIGHT:  kernel_name = "wiperight";      break;
+    case WIPEUP:     kernel_name = "wipeup";         break;
+    case WIPEDOWN:   kernel_name = "wipedown";       break;
+    case SLIDELEFT:  kernel_name = "slideleft";      break;
+    case SLIDERIGHT: kernel_name = "slideright";     break;
+    case SLIDEUP:    kernel_name = "slideup";        break;
+    case SLIDEDOWN:  kernel_name = "slidedown";      break;
+    default:
+        err = AVERROR_BUG;
+        goto fail;
+    }
+
+    ctx->kernel = clCreateKernel(ctx->ocf.program, kernel_name, &cle);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create kernel %d.\n", cle);
+
+    ctx->initialised = 1;
+
+    return 0;
+
+fail:
+    if (ctx->command_queue)
+        clReleaseCommandQueue(ctx->command_queue);
+    if (ctx->kernel)
+        clReleaseKernel(ctx->kernel);
+    return err;
+}
+
+static int xfade_frame(AVFilterContext *avctx, AVFrame *a, AVFrame *b)
+{
+    AVFilterLink *outlink = avctx->outputs[0];
+    XFadeOpenCLContext *ctx = avctx->priv;
+    AVFrame *output;
+    cl_int cle;
+    cl_float progress = av_clipf(1.f - ((cl_float)(ctx->pts - ctx->first_pts - ctx->offset_pts) / ctx->duration_pts), 0.f, 1.f);
+    size_t global_work[2];
+    int kernel_arg = 0;
+    int err, plane;
+
+    if (!ctx->initialised) {
+        AVHWFramesContext *main_fc =
+            (AVHWFramesContext*)a->hw_frames_ctx->data;
+        AVHWFramesContext *xfade_fc =
+            (AVHWFramesContext*)b->hw_frames_ctx->data;
+
+        err = xfade_opencl_load(avctx, main_fc->sw_format,
+                                xfade_fc->sw_format);
+        if (err < 0)
+            return err;
+    }
+
+    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+    if (!output) {
+        err = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    for (plane = 0; plane < ctx->nb_planes; plane++) {
+        cl_mem mem;
+        kernel_arg = 0;
+
+        mem = (cl_mem)output->data[plane];
+        CL_SET_KERNEL_ARG(ctx->kernel, kernel_arg, cl_mem, &mem);
+        kernel_arg++;
+
+        mem = (cl_mem)ctx->xf[0]->data[plane];
+        CL_SET_KERNEL_ARG(ctx->kernel, kernel_arg, cl_mem, &mem);
+        kernel_arg++;
+
+        mem = (cl_mem)ctx->xf[1]->data[plane];
+        CL_SET_KERNEL_ARG(ctx->kernel, kernel_arg, cl_mem, &mem);
+        kernel_arg++;
+
+        CL_SET_KERNEL_ARG(ctx->kernel, kernel_arg, cl_float, &progress);
+        kernel_arg++;
+
+        err = ff_opencl_filter_work_size_from_image(avctx, global_work,
+                                                    output, plane, 0);
+        if (err < 0)
+            goto fail;
+
+        cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->kernel, 2, NULL,
+                                     global_work, NULL, 0, NULL, NULL);
+        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue xfade kernel "
+                         "for plane %d: %d.\n", plane, cle);
+    }
+
+    cle = clFinish(ctx->command_queue);
+    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to finish command queue: %d.\n", cle);
+
+    err = av_frame_copy_props(output, ctx->xf[0]);
+    if (err < 0)
+        goto fail;
+
+    output->pts = ctx->pts;
+
+    return ff_filter_frame(outlink, output);
+
+fail:
+    av_frame_free(&output);
+    return err;
+}
+
+static int xfade_opencl_config_output(AVFilterLink *outlink)
+{
+    AVFilterContext *avctx = outlink->src;
+    XFadeOpenCLContext *ctx = avctx->priv;
+    AVFilterLink *mainlink = avctx->inputs[0];
+    int err;
+
+    err = ff_opencl_filter_config_output(outlink);
+    if (err < 0)
+        return err;
+
+    ctx->first_pts = ctx->pts = AV_NOPTS_VALUE;
+
+    outlink->w = mainlink->w;
+    outlink->h = mainlink->h;
+    outlink->time_base = mainlink->time_base;
+    outlink->sample_aspect_ratio = mainlink->sample_aspect_ratio;
+    outlink->frame_rate = mainlink->frame_rate;
+
+    if (ctx->duration)
+        ctx->duration_pts = av_rescale_q(ctx->duration, AV_TIME_BASE_Q, outlink->time_base);
+    if (ctx->offset)
+        ctx->offset_pts = av_rescale_q(ctx->offset, AV_TIME_BASE_Q, outlink->time_base);
+
+    return 0;
+}
+
+static int xfade_opencl_activate(AVFilterContext *avctx)
+{
+    XFadeOpenCLContext *ctx = avctx->priv;
+    AVFilterLink *outlink = avctx->outputs[0];
+    AVFrame *in = NULL;
+    int ret = 0, status;
+    int64_t pts;
+
+    FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx);
+
+    if (ctx->xfade_is_over) {
+        ret = ff_inlink_consume_frame(avctx->inputs[1], &in);
+        if (ret < 0) {
+            return ret;
+        } else if (ff_inlink_acknowledge_status(avctx->inputs[1], &status, &pts)) {
+            ff_outlink_set_status(outlink, status, ctx->pts);
+            return 0;
+        } else if (!ret) {
+            if (ff_outlink_frame_wanted(outlink)) {
+                ff_inlink_request_frame(avctx->inputs[1]);
+                return 0;
+            }
+        } else {
+            in->pts = ctx->pts;
+            ctx->pts += av_rescale_q(1, av_inv_q(outlink->frame_rate), outlink->time_base);
+            return ff_filter_frame(outlink, in);
+        }
+    }
+
+    if (ff_inlink_queued_frames(avctx->inputs[0]) > 0) {
+        ctx->xf[0] = ff_inlink_peek_frame(avctx->inputs[0], 0);
+        if (ctx->xf[0]) {
+            if (ctx->first_pts == AV_NOPTS_VALUE) {
+                ctx->first_pts = ctx->xf[0]->pts;
+            }
+            ctx->pts = ctx->xf[0]->pts;
+            if (ctx->first_pts + ctx->offset_pts > ctx->xf[0]->pts) {
+                ctx->xf[0] = NULL;
+                ctx->need_second = 0;
+                ff_inlink_consume_frame(avctx->inputs[0], &in);
+                return ff_filter_frame(outlink, in);
+            }
+
+            ctx->need_second = 1;
+        }
+    }
+
+    if (ctx->xf[0] && ff_inlink_queued_frames(avctx->inputs[1]) > 0) {
+        ff_inlink_consume_frame(avctx->inputs[0], &ctx->xf[0]);
+        ff_inlink_consume_frame(avctx->inputs[1], &ctx->xf[1]);
+
+        ctx->pts = ctx->xf[0]->pts;
+        if (ctx->xf[0]->pts - (ctx->first_pts + ctx->offset_pts) > ctx->duration_pts)
+            ctx->xfade_is_over = 1;
+        ret = xfade_frame(avctx, ctx->xf[0], ctx->xf[1]);
+        av_frame_free(&ctx->xf[0]);
+        av_frame_free(&ctx->xf[1]);
+        return ret;
+    }
+
+    if (ff_inlink_queued_frames(avctx->inputs[0]) > 0 &&
+        ff_inlink_queued_frames(avctx->inputs[1]) > 0) {
+        ff_filter_set_ready(avctx, 100);
+        return 0;
+    }
+
+    if (ff_outlink_frame_wanted(outlink)) {
+        if (!ctx->eof[0] && ff_outlink_get_status(avctx->inputs[0])) {
+            ctx->eof[0] = 1;
+            ctx->xfade_is_over = 1;
+        }
+        if (!ctx->eof[1] && ff_outlink_get_status(avctx->inputs[1])) {
+            ctx->eof[1] = 1;
+        }
+        if (!ctx->eof[0] && !ctx->xf[0])
+            ff_inlink_request_frame(avctx->inputs[0]);
+        if (!ctx->eof[1] && (ctx->need_second || ctx->eof[0]))
+            ff_inlink_request_frame(avctx->inputs[1]);
+        if (ctx->eof[0] && ctx->eof[1] && (
+            ff_inlink_queued_frames(avctx->inputs[0]) <= 0 ||
+            ff_inlink_queued_frames(avctx->inputs[1]) <= 0))
+            ff_outlink_set_status(outlink, AVERROR_EOF, AV_NOPTS_VALUE);
+        return 0;
+    }
+
+    return FFERROR_NOT_READY;
+}
+
+static av_cold void xfade_opencl_uninit(AVFilterContext *avctx)
+{
+    XFadeOpenCLContext *ctx = avctx->priv;
+    cl_int cle;
+
+    if (ctx->kernel) {
+        cle = clReleaseKernel(ctx->kernel);
+        if (cle != CL_SUCCESS)
+            av_log(avctx, AV_LOG_ERROR, "Failed to release "
+                   "kernel: %d.\n", cle);
+    }
+
+    if (ctx->command_queue) {
+        cle = clReleaseCommandQueue(ctx->command_queue);
+        if (cle != CL_SUCCESS)
+            av_log(avctx, AV_LOG_ERROR, "Failed to release "
+                   "command queue: %d.\n", cle);
+    }
+
+    ff_opencl_filter_uninit(avctx);
+}
+
+static AVFrame *get_video_buffer(AVFilterLink *inlink, int w, int h)
+{
+    XFadeOpenCLContext *s = inlink->dst->priv;
+
+    return s->xfade_is_over || !s->need_second ?
+        ff_null_get_video_buffer   (inlink, w, h) :
+        ff_default_get_video_buffer(inlink, w, h);
+}
+
+#define OFFSET(x) offsetof(XFadeOpenCLContext, x)
+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
+
+static const AVOption xfade_opencl_options[] = {
+    { "transition", "set cross fade transition", OFFSET(transition), AV_OPT_TYPE_INT, {.i64=1}, 0, NB_TRANSITIONS-1, FLAGS, "transition" },
+    {   "custom",    "custom transition",     0, AV_OPT_TYPE_CONST, {.i64=CUSTOM},    0, 0, FLAGS, "transition" },
+    {   "fade",      "fade transition",       0, AV_OPT_TYPE_CONST, {.i64=FADE},      0, 0, FLAGS, "transition" },
+    {   "wipeleft",  "wipe left transition",  0, AV_OPT_TYPE_CONST, {.i64=WIPELEFT},  0, 0, FLAGS, "transition" },
+    {   "wiperight", "wipe right transition", 0, AV_OPT_TYPE_CONST, {.i64=WIPERIGHT}, 0, 0, FLAGS, "transition" },
+    {   "wipeup",    "wipe up transition",    0, AV_OPT_TYPE_CONST, {.i64=WIPEUP},    0, 0, FLAGS, "transition" },
+    {   "wipedown",  "wipe down transition",  0, AV_OPT_TYPE_CONST, {.i64=WIPEDOWN},  0, 0, FLAGS, "transition" },
+    {   "slideleft",  "slide left transition",  0, AV_OPT_TYPE_CONST, {.i64=SLIDELEFT},  0, 0, FLAGS, "transition" },
+    {   "slideright", "slide right transition", 0, AV_OPT_TYPE_CONST, {.i64=SLIDERIGHT}, 0, 0, FLAGS, "transition" },
+    {   "slideup",    "slide up transition",    0, AV_OPT_TYPE_CONST, {.i64=SLIDEUP},    0, 0, FLAGS, "transition" },
+    {   "slidedown",  "slide down transition",  0, AV_OPT_TYPE_CONST, {.i64=SLIDEDOWN},  0, 0, FLAGS, "transition" },
+    { "source", "set OpenCL program source file for custom transition", OFFSET(source_file), AV_OPT_TYPE_STRING, {.str = NULL}, .flags = FLAGS },
+    { "kernel", "set kernel name in program file for custom transition", OFFSET(kernel_name), AV_OPT_TYPE_STRING, {.str = NULL}, .flags = FLAGS },
+    { "duration", "set cross fade duration", OFFSET(duration), AV_OPT_TYPE_DURATION, {.i64=1000000}, 0, 60000000, FLAGS },
+    { "offset",   "set cross fade start relative to first input stream", OFFSET(offset), AV_OPT_TYPE_DURATION, {.i64=0}, 0, 60000000, FLAGS },
+    { NULL }
+};
+
+AVFILTER_DEFINE_CLASS(xfade_opencl);
+
+static const AVFilterPad xfade_opencl_inputs[] = {
+    {
+        .name             = "main",
+        .type             = AVMEDIA_TYPE_VIDEO,
+        .get_video_buffer = get_video_buffer,
+        .config_props     = &ff_opencl_filter_config_input,
+    },
+    {
+        .name             = "xfade",
+        .type             = AVMEDIA_TYPE_VIDEO,
+        .get_video_buffer = get_video_buffer,
+        .config_props     = &ff_opencl_filter_config_input,
+    },
+    { NULL }
+};
+
+static const AVFilterPad xfade_opencl_outputs[] = {
+    {
+        .name          = "default",
+        .type          = AVMEDIA_TYPE_VIDEO,
+        .config_props  = &xfade_opencl_config_output,
+    },
+    { NULL }
+};
+
+AVFilter ff_vf_xfade_opencl = {
+    .name            = "xfade_opencl",
+    .description     = NULL_IF_CONFIG_SMALL("Cross fade one video with another video."),
+    .priv_size       = sizeof(XFadeOpenCLContext),
+    .priv_class      = &xfade_opencl_class,
+    .init            = &ff_opencl_filter_init,
+    .uninit          = &xfade_opencl_uninit,
+    .query_formats   = &ff_opencl_filter_query_formats,
+    .activate        = &xfade_opencl_activate,
+    .inputs          = xfade_opencl_inputs,
+    .outputs         = xfade_opencl_outputs,
+    .flags_internal  = FF_FILTER_FLAG_HWFRAME_AWARE,
+};

[FFmpeg-devel] avfilter: add xfade opencl filter

Checks

Commit Message

Comments

Patch