[FFmpeg-devel] avfilter: add format_cuda filter

Message ID 20210611144307.19241-1-timo@rothenpieler.org
State New
Series [FFmpeg-devel] avfilter: add format_cuda filter

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Timo Rothenpieler June 11, 2021, 2:43 p.m. UTC
---
 configure                           |   2 +
 doc/filters.texi                    |  46 ++
 libavfilter/Makefile                |   1 +
 libavfilter/allfilters.c            |   1 +
 libavfilter/cuda/vector_helpers.cuh |  14 +-
 libavfilter/version.h               |   2 +-
 libavfilter/vf_format_cuda.c        | 433 ++++++++++++++
 libavfilter/vf_format_cuda.cu       | 849 ++++++++++++++++++++++++++++
 8 files changed, 1345 insertions(+), 3 deletions(-)
 create mode 100644 libavfilter/vf_format_cuda.c
 create mode 100644 libavfilter/vf_format_cuda.cu

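For orientation, a minimal usage sketch — not part of the patch, and assuming a
CUDA-capable build where the decoder outputs CUDA frames — could look like:

    ffmpeg -hwaccel cuda -hwaccel_output_format cuda -i input.mp4 \
        -vf format_cuda=format=p010le \
        -c:v hevc_nvenc output.mkv

Here the decoded frames are converted to p010le on the GPU before they reach the
encoder; called with no arguments, format_cuda only copies each frame into a new
buffer, as described in the documentation added below.
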
Comments

Liu Steven June 11, 2021, 3:33 p.m. UTC | #1
> On 11 Jun 2021, at 22:43, Timo Rothenpieler <timo@rothenpieler.org> wrote:
Hi Timo,
> 
> ---
> configure                           |   2 +
> doc/filters.texi                    |  46 ++
> libavfilter/Makefile                |   1 +
> libavfilter/allfilters.c            |   1 +
> libavfilter/cuda/vector_helpers.cuh |  14 +-
> libavfilter/version.h               |   2 +-
> libavfilter/vf_format_cuda.c        | 433 ++++++++++++++
> libavfilter/vf_format_cuda.cu       | 849 ++++++++++++++++++++++++++++
> 8 files changed, 1345 insertions(+), 3 deletions(-)
> create mode 100644 libavfilter/vf_format_cuda.c
> create mode 100644 libavfilter/vf_format_cuda.cu
> 
> diff --git a/configure b/configure
> index 6bfd98b384..7041c09177 100755
> --- a/configure
> +++ b/configure
> @@ -3078,6 +3078,8 @@ qsvvpp_select="qsv"
> vaapi_encode_deps="vaapi"
> v4l2_m2m_deps="linux_videodev2_h sem_timedwait"
> 
> +format_cuda_filter_deps="ffnvcodec"
> +format_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
> hwupload_cuda_filter_deps="ffnvcodec"
> scale_npp_filter_deps="ffnvcodec libnpp"
> scale_cuda_filter_deps="ffnvcodec"
> diff --git a/doc/filters.texi b/doc/filters.texi
> index 78faf767cf..27a0184cb3 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -12324,6 +12324,52 @@ format=pix_fmts=yuv420p|yuv444p|yuv410p
> @end example
> @end itemize
> 
> +@anchor{format_cuda}
> +@section format_cuda
> +
> +Convert the input video to one of the specified pixel formats.
> +Libavfilter will try to pick one that is suitable as input to
> +the next filter.
> +
> +It accepts the following parameters:
> +@table @option
> +
> +@item format
> +The name of the desired output format.
> +If none is specified, the input format will be used.
> +
> +@item passthrough
> +If true, input frames matching the output format will be passed
> +through as-is.
> +If false, every input frame is processed. Frames matching the
> +output format will be copied without data modification. This is
> +the default mode.
> +
> +@end table
> +
> +@subsection Examples
> +
> +@itemize
> +@item
> +Convert the input video to the @var{yuv420p} format.
> +@example
> +format_cuda=yuv420p
Maybe this filter also needs a format with an alpha channel, so it can be used with overlay_cuda for a colorkey-like feature? (A hypothetical sketch of what that could enable follows after this quoted hunk.)
> +@end example
> +
> +Convert the input video to the @var{yuv420p} format, but pass-through any
> +input that already matches the format without touching the frames at all.
> +@example
> +format_cuda=yuv420p:1
> +@end example
> +
> +With no arguments, every input frame will be copied into a new
> +buffer, with no further processing done. This can be useful to decouple
> +a decoder with a limited buffer pool from a processing chain with deep buffers.
> +@example
> +format_cuda
> +@end example
> +@end itemize
> +
> @anchor{fps}
> @section fps
> 
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index bc81033e3f..1f8331c4f4 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -273,6 +273,7 @@ OBJS-$(CONFIG_FILLBORDERS_FILTER)            += vf_fillborders.o
> OBJS-$(CONFIG_FIND_RECT_FILTER)              += vf_find_rect.o lavfutils.o
> OBJS-$(CONFIG_FLOODFILL_FILTER)              += vf_floodfill.o
> OBJS-$(CONFIG_FORMAT_FILTER)                 += vf_format.o
> +OBJS-$(CONFIG_FORMAT_CUDA_FILTER)            += vf_format_cuda.o vf_format_cuda.ptx.o
> OBJS-$(CONFIG_FPS_FILTER)                    += vf_fps.o
> OBJS-$(CONFIG_FRAMEPACK_FILTER)              += vf_framepack.o
> OBJS-$(CONFIG_FRAMERATE_FILTER)              += vf_framerate.o
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index c6afef835f..947214dc25 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -258,6 +258,7 @@ extern const AVFilter ff_vf_fillborders;
> extern const AVFilter ff_vf_find_rect;
> extern const AVFilter ff_vf_floodfill;
> extern const AVFilter ff_vf_format;
> +extern const AVFilter ff_vf_format_cuda;
> extern const AVFilter ff_vf_fps;
> extern const AVFilter ff_vf_framepack;
> extern const AVFilter ff_vf_framerate;
> diff --git a/libavfilter/cuda/vector_helpers.cuh b/libavfilter/cuda/vector_helpers.cuh
> index 67332ef030..8ce67ce579 100644
> --- a/libavfilter/cuda/vector_helpers.cuh
> +++ b/libavfilter/cuda/vector_helpers.cuh
> @@ -42,31 +42,41 @@ template<> struct vector_helper<int4>    { typedef float4 ftype; typedef int4 it
> 
> template<typename T, typename V> inline __device__ V to_floatN(const T &a) { return (V)a; }
> template<typename T, typename V> inline __device__ T from_floatN(const V &a) { return (T)a; }
> +template<typename T, typename V> inline __device__ V to_intN(const T &a) { return (V)a; }
> +template<typename T, typename V> inline __device__ T from_intN(const V &a) { return (T)a; }
> 
> #define OPERATORS2(T) \
>     template<typename V> inline __device__ T operator+(const T &a, const V &b) { return make_ ## T (a.x + b.x, a.y + b.y); } \
>     template<typename V> inline __device__ T operator-(const T &a, const V &b) { return make_ ## T (a.x - b.x, a.y - b.y); } \
>     template<typename V> inline __device__ T operator*(const T &a, V b) { return make_ ## T (a.x * b, a.y * b); } \
>     template<typename V> inline __device__ T operator/(const T &a, V b) { return make_ ## T (a.x / b, a.y / b); } \
> +    template<typename V> inline __device__ T operator&(const T &a, V b) { return make_ ## T (a.x & b, a.y & b); } \
> +    template<typename V> inline __device__ T operator|(const T &a, V b) { return make_ ## T (a.x | b, a.y | b); } \
>     template<typename V> inline __device__ T operator>>(const T &a, V b) { return make_ ## T (a.x >> b, a.y >> b); } \
>     template<typename V> inline __device__ T operator<<(const T &a, V b) { return make_ ## T (a.x << b, a.y << b); } \
>     template<typename V> inline __device__ T &operator+=(T &a, const V &b) { a.x += b.x; a.y += b.y; return a; } \
>     template<typename V> inline __device__ void vec_set(T &a, const V &b) { a.x = b.x; a.y = b.y; } \
>     template<typename V> inline __device__ void vec_set_scalar(T &a, V b) { a.x = b; a.y = b; } \
>     template<> inline __device__ float2 to_floatN<T, float2>(const T &a) { return make_float2(a.x, a.y); } \
> -    template<> inline __device__ T from_floatN<T, float2>(const float2 &a) { return make_ ## T(a.x, a.y); }
> +    template<> inline __device__ T from_floatN<T, float2>(const float2 &a) { return make_ ## T(a.x, a.y); } \
> +    template<> inline __device__ int2 to_intN<T, int2>(const T &a) { return make_int2(a.x, a.y); } \
> +    template<> inline __device__ T from_intN<T, int2>(const int2 &a) { return make_ ## T(a.x, a.y); }
> #define OPERATORS4(T) \
>     template<typename V> inline __device__ T operator+(const T &a, const V &b) { return make_ ## T (a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } \
>     template<typename V> inline __device__ T operator-(const T &a, const V &b) { return make_ ## T (a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } \
>     template<typename V> inline __device__ T operator*(const T &a, V b) { return make_ ## T (a.x * b, a.y * b, a.z * b, a.w * b); } \
>     template<typename V> inline __device__ T operator/(const T &a, V b) { return make_ ## T (a.x / b, a.y / b, a.z / b, a.w / b); } \
> +    template<typename V> inline __device__ T operator&(const T &a, V b) { return make_ ## T (a.x & b, a.y & b, a.z & b, a.w & b); } \
> +    template<typename V> inline __device__ T operator|(const T &a, V b) { return make_ ## T (a.x | b, a.y | b, a.z | b, a.w | b); } \
>     template<typename V> inline __device__ T operator>>(const T &a, V b) { return make_ ## T (a.x >> b, a.y >> b, a.z >> b, a.w >> b); } \
>     template<typename V> inline __device__ T operator<<(const T &a, V b) { return make_ ## T (a.x << b, a.y << b, a.z << b, a.w << b); } \
>     template<typename V> inline __device__ T &operator+=(T &a, const V &b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } \
>     template<typename V> inline __device__ void vec_set(T &a, const V &b) { a.x = b.x; a.y = b.y; a.z = b.z; a.w = b.w; } \
>     template<typename V> inline __device__ void vec_set_scalar(T &a, V b) { a.x = b; a.y = b; a.z = b; a.w = b; } \
>     template<> inline __device__ float4 to_floatN<T, float4>(const T &a) { return make_float4(a.x, a.y, a.z, a.w); } \
> -    template<> inline __device__ T from_floatN<T, float4>(const float4 &a) { return make_ ## T(a.x, a.y, a.z, a.w); }
> +    template<> inline __device__ T from_floatN<T, float4>(const float4 &a) { return make_ ## T(a.x, a.y, a.z, a.w); } \
> +    template<> inline __device__ int4 to_intN<T, int4>(const T &a) { return make_int4(a.x, a.y, a.z, a.w); } \
> +    template<> inline __device__ T from_intN<T, int4>(const int4 &a) { return make_ ## T(a.x, a.y, a.z, a.w); }
> 
> OPERATORS2(int2)
> OPERATORS2(uchar2)
> diff --git a/libavfilter/version.h b/libavfilter/version.h
> index 5052681653..fbb81ef31c 100644
> --- a/libavfilter/version.h
> +++ b/libavfilter/version.h
> @@ -31,7 +31,7 @@
> 
> #define LIBAVFILTER_VERSION_MAJOR   8
> #define LIBAVFILTER_VERSION_MINOR   0
> -#define LIBAVFILTER_VERSION_MICRO 102
> +#define LIBAVFILTER_VERSION_MICRO 103
> 
> 
> #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
> diff --git a/libavfilter/vf_format_cuda.c b/libavfilter/vf_format_cuda.c
> new file mode 100644
> index 0000000000..89f05b1350
> --- /dev/null
> +++ b/libavfilter/vf_format_cuda.c
> @@ -0,0 +1,433 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include <float.h>
> +#include <stdio.h>
> +#include <string.h>
> +
> +#include "libavutil/avstring.h"
> +#include "libavutil/common.h"
> +#include "libavutil/hwcontext.h"
> +#include "libavutil/hwcontext_cuda_internal.h"
> +#include "libavutil/cuda_check.h"
> +#include "libavutil/internal.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/pixdesc.h"
> +
> +#include "avfilter.h"
> +#include "formats.h"
> +#include "internal.h"
> +#include "video.h"
> +
> +static const enum AVPixelFormat supported_formats[] = {
> +    AV_PIX_FMT_YUV420P,
> +    AV_PIX_FMT_NV12,
> +    AV_PIX_FMT_YUV444P,
> +    AV_PIX_FMT_P010,
> +    AV_PIX_FMT_P016,
> +    AV_PIX_FMT_YUV444P16,
> +};
> +
> +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
> +#define BLOCKX 32
> +#define BLOCKY 16
> +
> +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
> +
> +typedef struct CUDAFormatContext {
> +    const AVClass *class;
> +
> +    AVCUDADeviceContext *hwctx;
> +
> +    enum AVPixelFormat in_fmt;
> +    enum AVPixelFormat out_fmt;
> +
> +    AVBufferRef *frames_ctx;
> +    AVFrame     *frame;
> +
> +    AVFrame *tmp_frame;
> +    int passthrough;
> +
> +    /**
> +     * Output sw format. AV_PIX_FMT_NONE for no conversion.
> +     */
> +    enum AVPixelFormat format;
> +
> +    CUcontext   cu_ctx;
> +    CUmodule    cu_module;
> +    CUfunction  cu_func_convert;
> +    CUstream    cu_stream;
> +} CUDAFormatContext;
> +
> +static av_cold int cudaformat_init(AVFilterContext *ctx)
> +{
> +    CUDAFormatContext *s = ctx->priv;
> +
> +    s->frame = av_frame_alloc();
> +    if (!s->frame)
> +        return AVERROR(ENOMEM);
> +
> +    s->tmp_frame = av_frame_alloc();
> +    if (!s->tmp_frame)
> +        return AVERROR(ENOMEM);
> +
> +    return 0;
> +}
> +
> +static av_cold void cudaformat_uninit(AVFilterContext *ctx)
> +{
> +    CUDAFormatContext *s = ctx->priv;
> +
> +    if (s->hwctx && s->cu_module) {
> +        CudaFunctions *cu = s->hwctx->internal->cuda_dl;
> +        CUcontext dummy;
> +
> +        CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
> +        CHECK_CU(cu->cuModuleUnload(s->cu_module));
> +        s->cu_module = NULL;
> +        CHECK_CU(cu->cuCtxPopCurrent(&dummy));
> +    }
> +
> +    av_frame_free(&s->frame);
> +    av_buffer_unref(&s->frames_ctx);
> +    av_frame_free(&s->tmp_frame);
> +}
> +
> +static int cudaformat_query_formats(AVFilterContext *ctx)
> +{
> +    static const enum AVPixelFormat pixel_formats[] = {
> +        AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE,
> +    };
> +    AVFilterFormats *pix_fmts = ff_make_format_list(pixel_formats);
> +    if (!pix_fmts)
> +        return AVERROR(ENOMEM);
> +
> +    return ff_set_common_formats(ctx, pix_fmts);
> +}
> +
> +static av_cold int init_hwframe_ctx(CUDAFormatContext *s, AVBufferRef *device_ctx, int width, int height)
> +{
> +    AVBufferRef *out_ref = NULL;
> +    AVHWFramesContext *out_ctx;
> +    int ret;
> +
> +    out_ref = av_hwframe_ctx_alloc(device_ctx);
> +    if (!out_ref)
> +        return AVERROR(ENOMEM);
> +    out_ctx = (AVHWFramesContext*)out_ref->data;
> +
> +    out_ctx->format    = AV_PIX_FMT_CUDA;
> +    out_ctx->sw_format = s->out_fmt;
> +    out_ctx->width     = FFALIGN(width,  32);
> +    out_ctx->height    = FFALIGN(height, 32);
> +
> +    ret = av_hwframe_ctx_init(out_ref);
> +    if (ret < 0)
> +        goto fail;
> +
> +    av_frame_unref(s->frame);
> +    ret = av_hwframe_get_buffer(out_ref, s->frame, 0);
> +    if (ret < 0)
> +        goto fail;
> +
> +    s->frame->width  = width;
> +    s->frame->height = height;
> +
> +    av_buffer_unref(&s->frames_ctx);
> +    s->frames_ctx = out_ref;
> +
> +    return 0;
> +fail:
> +    av_buffer_unref(&out_ref);
> +    return ret;
> +}
> +
> +static int format_is_supported(enum AVPixelFormat fmt)
> +{
> +    int i;
> +
> +    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
> +        if (supported_formats[i] == fmt)
> +            return 1;
> +    return 0;
> +}
> +
> +static av_cold int init_processing_chain(AVFilterContext *ctx, int width, int height)
> +{
> +    CUDAFormatContext *s = ctx->priv;
> +
> +    AVHWFramesContext *in_frames_ctx;
> +
> +    enum AVPixelFormat in_format;
> +    enum AVPixelFormat out_format;
> +    int ret;
> +
> +    /* check that we have a hw context */
> +    if (!ctx->inputs[0]->hw_frames_ctx) {
> +        av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n");
> +        return AVERROR(EINVAL);
> +    }
> +    in_frames_ctx = (AVHWFramesContext*)ctx->inputs[0]->hw_frames_ctx->data;
> +    in_format     = in_frames_ctx->sw_format;
> +    out_format    = (s->format == AV_PIX_FMT_NONE) ? in_format : s->format;
> +
> +    if (!format_is_supported(in_format)) {
> +        av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n",
> +               av_get_pix_fmt_name(in_format));
> +        return AVERROR(ENOSYS);
> +    }
> +    if (!format_is_supported(out_format)) {
> +        av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n",
> +               av_get_pix_fmt_name(out_format));
> +        return AVERROR(ENOSYS);
> +    }
> +
> +    s->in_fmt = in_format;
> +    s->out_fmt = out_format;
> +
> +    if (s->passthrough && in_format == out_format) {
> +        s->frames_ctx = av_buffer_ref(ctx->inputs[0]->hw_frames_ctx);
> +        if (!s->frames_ctx)
> +            return AVERROR(ENOMEM);
> +    } else {
> +        s->passthrough = 0;
> +
> +        ret = init_hwframe_ctx(s, in_frames_ctx->device_ref, width, height);
> +        if (ret < 0)
> +            return ret;
> +    }
> +
> +    ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->frames_ctx);
> +    if (!ctx->outputs[0]->hw_frames_ctx)
> +        return AVERROR(ENOMEM);
> +
> +    return 0;
> +}
> +
> +static av_cold int cudaformat_config_props(AVFilterLink *outlink)
> +{
> +    AVFilterContext *ctx = outlink->src;
> +    AVFilterLink *inlink = outlink->src->inputs[0];
> +    CUDAFormatContext *s = ctx->priv;
> +    AVHWFramesContext *frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
> +    AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx;
> +    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
> +    CudaFunctions *cu = device_hwctx->internal->cuda_dl;
> +    const char *in_fmt_name, *out_fmt_name;
> +    char buf[64];
> +    int ret;
> +
> +    extern char vf_format_cuda_ptx[];
> +
> +    s->hwctx = device_hwctx;
> +    s->cu_stream = s->hwctx->stream;
> +
> +    ret = init_processing_chain(ctx, inlink->w, inlink->h);
> +    if (ret < 0)
> +        return ret;
> +
> +    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
> +    if (ret < 0)
> +        return ret;
> +
> +    ret = CHECK_CU(cu->cuModuleLoadData(&s->cu_module, vf_format_cuda_ptx));
> +    if (ret < 0)
> +        goto fail;
> +
> +    in_fmt_name = av_get_pix_fmt_name(s->in_fmt);
> +    out_fmt_name = av_get_pix_fmt_name(s->out_fmt);
> +    snprintf(buf, sizeof(buf), "Convert_%s_%s", in_fmt_name, out_fmt_name);
> +
> +    if (s->in_fmt != s->out_fmt) {
> +        av_log(ctx, AV_LOG_DEBUG, "Loading conversion kernel: %s\n", buf);
> +        ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_convert, s->cu_module, buf));
> +        if (ret < 0)
> +            goto fail;
> +    }
> +
> +    outlink->w = inlink->w;
> +    outlink->h = inlink->h;
> +    outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
> +
> +    av_log(ctx, AV_LOG_VERBOSE, "%s -> %s%s\n",
> +           in_fmt_name, out_fmt_name, s->passthrough ? " (passthrough)" : "");
> +
> +fail:
> +    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
> +    return ret;
> +}
> +
> +static int call_conversion_kernel(AVFilterContext *ctx,
> +                                  AVFrame *out, AVFrame *in)
> +{
> +    CUDAFormatContext *s = ctx->priv;
> +    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
> +    int ret;
> +
> +    CUdeviceptr dst_devptrs[] = { (CUdeviceptr)out->data[0], (CUdeviceptr)out->data[1],
> +                                  (CUdeviceptr)out->data[2], (CUdeviceptr)out->data[3] };
> +    CUdeviceptr src_devptrs[] = { (CUdeviceptr)in->data[0],  (CUdeviceptr)in->data[1],
> +                                  (CUdeviceptr)in->data[2],  (CUdeviceptr)in->data[3] };
> +    void *args[] = { &in->width, &in->height,
> +                     &dst_devptrs[0], &out->linesize[0], &src_devptrs[0], &in->linesize[0],
> +                     &dst_devptrs[1], &out->linesize[1], &src_devptrs[1], &in->linesize[1],
> +                     &dst_devptrs[2], &out->linesize[2], &src_devptrs[2], &in->linesize[2],
> +                     &dst_devptrs[3], &out->linesize[3], &src_devptrs[3], &in->linesize[3] };
> +
> +    ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_convert,
> +                                      DIV_UP(out->width, BLOCKX), DIV_UP(out->height, BLOCKY), 1,
> +                                      BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL));
> +    if (ret < 0)
> +        return ret;
> +
> +    return 0;
> +}
> +
> +static int cudaformat_convert(AVFilterContext *ctx, AVFrame *out, AVFrame *in)
> +{
> +    CUDAFormatContext *s = ctx->priv;
> +    AVFilterLink *outlink = ctx->outputs[0];
> +    AVFrame *src = in;
> +    int ret;
> +
> +    if (s->in_fmt != s->out_fmt)
> +        ret = call_conversion_kernel(ctx, s->frame, src);
> +    else
> +        ret = av_hwframe_transfer_data(s->frame, src, 0);
> +    if (ret < 0)
> +        return ret;
> +
> +    src = s->frame;
> +    ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0);
> +    if (ret < 0)
> +        return ret;
> +
> +    av_frame_move_ref(out, s->frame);
> +    av_frame_move_ref(s->frame, s->tmp_frame);
> +
> +    s->frame->width  = outlink->w;
> +    s->frame->height = outlink->h;
> +
> +    ret = av_frame_copy_props(out, in);
> +    if (ret < 0)
> +        return ret;
> +
> +    return 0;
> +}
> +
> +static int cudaformat_filter_frame(AVFilterLink *link, AVFrame *in)
> +{
> +    AVFilterContext       *ctx = link->dst;
> +    CUDAFormatContext        *s = ctx->priv;
> +    AVFilterLink      *outlink = ctx->outputs[0];
> +    CudaFunctions          *cu = s->hwctx->internal->cuda_dl;
> +
> +    AVFrame *out = NULL;
> +    CUcontext dummy;
> +    int ret = 0;
> +
> +    if (s->passthrough)
> +        return ff_filter_frame(outlink, in);
> +
> +    out = av_frame_alloc();
> +    if (!out) {
> +        ret = AVERROR(ENOMEM);
> +        goto fail;
> +    }
> +
> +    ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
> +    if (ret < 0)
> +        goto fail;
> +
> +    ret = cudaformat_convert(ctx, out, in);
> +    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
> +    if (ret < 0)
> +        goto fail;
> +
> +    av_frame_free(&in);
> +    return ff_filter_frame(outlink, out);
> +
> +fail:
> +    av_frame_free(&in);
> +    av_frame_free(&out);
> +    return ret;
> +}
> +
> +static AVFrame *cudaformat_get_video_buffer(AVFilterLink *inlink, int w, int h)
> +{
> +    CUDAFormatContext *s = inlink->dst->priv;
> +
> +    return s->passthrough ?
> +        ff_null_get_video_buffer   (inlink, w, h) :
> +        ff_default_get_video_buffer(inlink, w, h);
> +}
> +
> +#define OFFSET(x) offsetof(CUDAFormatContext, x)
> +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM)
> +static const AVOption options[] = {
> +    { "format", "Output pixel format", OFFSET(format), AV_OPT_TYPE_PIXEL_FMT, { .i64 = AV_PIX_FMT_NONE }, AV_PIX_FMT_NONE, INT_MAX, FLAGS },
> +    { "passthrough", "Do not process frames at all if parameters match", OFFSET(passthrough), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS },
> +    { NULL },
> +};
> +
> +static const AVClass cudaformat_class = {
> +    .class_name = "cudaformat",
> +    .item_name  = av_default_item_name,
> +    .option     = options,
> +    .version    = LIBAVUTIL_VERSION_INT,
> +};
> +
> +static const AVFilterPad cudaformat_inputs[] = {
> +    {
> +        .name        = "default",
> +        .type        = AVMEDIA_TYPE_VIDEO,
> +        .filter_frame = cudaformat_filter_frame,
> +        .get_video_buffer = cudaformat_get_video_buffer,
> +    },
> +    { NULL }
> +};
> +
> +static const AVFilterPad cudaformat_outputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +        .config_props = cudaformat_config_props,
> +    },
> +    { NULL }
> +};
> +
> +const AVFilter ff_vf_format_cuda = {
> +    .name      = "format_cuda",
> +    .description = NULL_IF_CONFIG_SMALL("GPU accelerated video format conversion"),
> +
> +    .init          = cudaformat_init,
> +    .uninit        = cudaformat_uninit,
> +    .query_formats = cudaformat_query_formats,
> +
> +    .priv_size = sizeof(CUDAFormatContext),
> +    .priv_class = &cudaformat_class,
> +
> +    .inputs    = cudaformat_inputs,
> +    .outputs   = cudaformat_outputs,
> +
> +    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
> +};
> diff --git a/libavfilter/vf_format_cuda.cu b/libavfilter/vf_format_cuda.cu
> new file mode 100644
> index 0000000000..f706f9b13d
> --- /dev/null
> +++ b/libavfilter/vf_format_cuda.cu
> @@ -0,0 +1,849 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include "cuda/vector_helpers.cuh"
> +
> +static const ushort mask_10bit = 0xFFC0;
> +static const ushort mask_16bit = 0xFFFF;
> +
> +static inline __device__ ushort conv_8to16(uchar in, ushort mask)
> +{
> +    return ((ushort)in | ((ushort)in << 8)) & mask;
> +}
> +
> +static inline __device__ ushort2 conv_8to16(uchar2 in, ushort mask)
> +{
> +    return make_ushort2(
> +        conv_8to16(in.x, mask),
> +        conv_8to16(in.y, mask)
> +    );
> +}
> +
> +static inline __device__ uchar conv_16to8(ushort in)
> +{
> +    return in >> 8;
> +}
> +
> +static inline __device__ uchar2 conv_16to8(ushort2 in)
> +{
> +    return make_uchar2(
> +        conv_16to8(in.x),
> +        conv_16to8(in.y)
> +    );
> +}
> +
> +static inline __device__ uchar conv_10to8(ushort in)
> +{
> +    return in >> 8;
> +}
> +
> +static inline __device__ uchar2 conv_10to8(ushort2 in)
> +{
> +    return make_uchar2(
> +        conv_10to8(in.x),
> +        conv_10to8(in.y)
> +    );
> +}
> +
> +static inline __device__ ushort conv_10to16(ushort in)
> +{
> +    return in | (in >> 10);
> +}
> +
> +static inline __device__ ushort2 conv_10to16(ushort2 in)
> +{
> +    return make_ushort2(
> +        conv_10to16(in.x),
> +        conv_10to16(in.y)
> +    );
> +}
> +
> +static inline __device__ ushort conv_16to10(ushort in)
> +{
> +    return in & mask_10bit;
> +}
> +
> +static inline __device__ ushort2 conv_16to10(ushort2 in)
> +{
> +    return make_ushort2(
> +        conv_16to10(in.x),
> +        conv_16to10(in.y)
> +    );
> +}
> +
> +template<typename T>
> +static inline __device__ T conv_444to420(const T *src, int pitch, int x, int y)
> +{
> +    unsigned tmp = (unsigned)src[ y      * pitch +  x] +
> +                   (unsigned)src[(y + 1) * pitch +  x] +
> +                   (unsigned)src[ y      * pitch + (x + 1)] +
> +                   (unsigned)src[(y + 1) * pitch + (x + 1)];
> +    return tmp / 4;
> +}
> +
> +static inline __device__ ushort conv_444to420p16(const uchar *src, int pitch, int x, int y, ushort mask)
> +{
> +    unsigned tmp = (unsigned)conv_8to16(src[ y      * pitch +  x], mask_16bit) +
> +                   (unsigned)conv_8to16(src[(y + 1) * pitch +  x], mask_16bit) +
> +                   (unsigned)conv_8to16(src[ y      * pitch + (x + 1)], mask_16bit) +
> +                   (unsigned)conv_8to16(src[(y + 1) * pitch + (x + 1)], mask_16bit);
> +    return (tmp / 4) & mask;
> +}
> +
> +template<typename T>
> +static inline __device__ T conv_420to444(const T *src, int width, int height, int pitch, int x, int y)
> +{
> +    int x1 = x / 2;
> +    int y1 = y / 2;
> +    int x2 = min(x1 + 1, width - 1);
> +    int y2 = min(y1 + 1, height - 1);
> +
> +    intT tmp;
> +    vec_set_scalar(tmp, 0);
> +    tmp += to_intN<T, intT>(src[y1 * pitch + x1]);
> +    tmp += to_intN<T, intT>(src[y1 * pitch + x2]);
> +    tmp += to_intN<T, intT>(src[y2 * pitch + x1]);
> +    tmp += to_intN<T, intT>(src[y2 * pitch + x2]);
> +
> +    return from_intN<T, intT>(tmp / 4);
> +}
> +
> +template<typename T, typename O>
> +static inline __device__ O conv_420to444p16(const T *src, int width, int height, int pitch, int x, int y, ushort mask)
> +{
> +    int x1 = x / 2;
> +    int y1 = y / 2;
> +    int x2 = min(x1 + 1, (width / 2) - 1);
> +    int y2 = min(y1 + 1, (height / 2) - 1);
> +
> +    intT tmp;
> +    vec_set_scalar(tmp, 0);
> +    tmp += to_intN<O, intT>(conv_8to16(src[y1 * pitch + x1], mask_16bit));
> +    tmp += to_intN<O, intT>(conv_8to16(src[y1 * pitch + x2], mask_16bit));
> +    tmp += to_intN<O, intT>(conv_8to16(src[y2 * pitch + x1], mask_16bit));
> +    tmp += to_intN<O, intT>(conv_8to16(src[y2 * pitch + x2], mask_16bit));
> +
> +    return from_intN<O, intT>((tmp / 4) & mask);
> +}
> +
> +#define FIX_PITCH(name) name ## _pitch /= sizeof(*name)
> +
> +// yuv420p->X
> +extern "C" {
> +
> +__global__ void Convert_yuv420p_nv12(int width, int height,
> +                                     uchar  *dst_y  , int dst_y_pitch , const uchar *src_y, int src_y_pitch,
> +                                     uchar2 *dst_uv , int dst_uv_pitch, const uchar *src_u, int src_u_pitch,
> +                                     uchar  *unused0, int unused1     , const uchar *src_v, int src_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        dst_y[y * dst_y_pitch + x] = src_y[y * src_y_pitch + x];
> +    }
> +
> +    if (x < width / 2 && y < height / 2) {
> +        FIX_PITCH(dst_uv);
> +
> +        dst_uv[y * dst_uv_pitch + x] = make_uchar2(
> +            src_u[y * src_u_pitch + x],
> +            src_v[y * src_v_pitch + x]
> +        );
> +    }
> +}
> +
> +__global__ void Convert_yuv420p_yuv444p(int width, int height,
> +                                        uchar *dst_y, int dst_y_pitch, const uchar *src_y, int src_y_pitch,
> +                                        uchar *dst_u, int dst_u_pitch, const uchar *src_u, int src_u_pitch,
> +                                        uchar *dst_v, int dst_v_pitch, const uchar *src_v, int src_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        dst_y[y * dst_y_pitch + x] = src_y[y  * src_y_pitch + x];
> +        dst_u[y * dst_u_pitch + x] = conv_420to444(src_u, width, height, src_u_pitch, x, y);
> +        dst_v[y * dst_v_pitch + x] = conv_420to444(src_v, width, height, src_v_pitch, x, y);
> +    }
> +}
> +
> +__global__ void Convert_yuv420p_p010le(int width, int height,
> +                                       ushort  *dst_y,  int dst_y_pitch,  const uchar *src_y, int src_y_pitch,
> +                                       ushort2 *dst_uv, int dst_uv_pitch, const uchar *src_u, int src_u_pitch,
> +                                       ushort2 *unuse0, int unused_pitch, const uchar *src_v, int src_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(dst_y);
> +
> +        dst_y[y * dst_y_pitch + x] = conv_8to16(src_y[y * src_y_pitch + x], mask_10bit);
> +    }
> +
> +    if (x < width / 2 && y < height / 2) {
> +        FIX_PITCH(dst_uv);
> +
> +        dst_uv[y * dst_uv_pitch + x] = make_ushort2(
> +            conv_8to16(src_u[y * src_u_pitch + x], mask_10bit),
> +            conv_8to16(src_v[y * src_v_pitch + x], mask_10bit)
> +        );
> +    }
> +}
> +
> +__global__ void Convert_yuv420p_p016le(int width, int height,
> +                                       ushort  *dst_y,  int dst_y_pitch,  const uchar *src_y, int src_y_pitch,
> +                                       ushort2 *dst_uv, int dst_uv_pitch, const uchar *src_u, int src_u_pitch,
> +                                       ushort2 *unuse0, int unused_pitch, const uchar *src_v, int src_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(dst_y);
> +
> +        dst_y[y * dst_y_pitch + x] = conv_8to16(src_y[y * src_y_pitch + x], mask_16bit);
> +    }
> +
> +    if (x < width / 2 && y < height / 2) {
> +        FIX_PITCH(dst_uv);
> +
> +        dst_uv[y * dst_uv_pitch + x] = make_ushort2(
> +            conv_8to16(src_u[y * src_u_pitch + x], mask_16bit),
> +            conv_8to16(src_v[y * src_v_pitch + x], mask_16bit)
> +        );
> +    }
> +}
> +
> +__global__ void Convert_yuv420p_yuv444p16le(int width, int height,
> +                                            ushort *dst_y, int dst_y_pitch, const uchar *src_y, int src_y_pitch,
> +                                            ushort *dst_u, int dst_u_pitch, const uchar *src_u, int src_u_pitch,
> +                                            ushort *dst_v, int dst_v_pitch, const uchar *src_v, int src_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(dst_y);
> +        FIX_PITCH(dst_u);
> +        FIX_PITCH(dst_v);
> +
> +        dst_y[y * dst_y_pitch + x] = conv_8to16(src_y[y  * src_y_pitch + x],  mask_16bit);
> +        dst_u[y * dst_u_pitch + x] = conv_420to444p16<uchar, ushort>(src_u, width, height, src_u_pitch, x, y, mask_16bit);
> +        dst_v[y * dst_v_pitch + x] = conv_420to444p16<uchar, ushort>(src_v, width, height, src_v_pitch, x, y, mask_16bit);
> +    }
> +}
> +
> +}
> +
> +// nv12->X
> +extern "C" {
> +
> +__global__ void Convert_nv12_yuv420p(int width, int height,
> +                                     uchar *dst_y, int dst_y_pitch, const uchar  *src_y,  int src_y_pitch,
> +                                     uchar *dst_u, int dst_u_pitch, const uchar2 *src_uv, int src_uv_pitch,
> +                                     uchar *dst_v, int dst_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        dst_y[y * dst_y_pitch + x] = src_y[y * src_y_pitch + x];
> +    }
> +
> +    if (x < width / 2 && y < height / 2) {
> +        FIX_PITCH(src_uv);
> +
> +        const uchar2 &uv = src_uv[y * src_uv_pitch + x];
> +        dst_u[y * dst_u_pitch + x] = uv.x;
> +        dst_v[y * dst_v_pitch + x] = uv.y;
> +    }
> +}
> +
> +__global__ void Convert_nv12_yuv444p(int width, int height,
> +                                     uchar *dst_y, int dst_y_pitch, const uchar  *src_y,  int src_y_pitch,
> +                                     uchar *dst_u, int dst_u_pitch, const uchar2 *src_uv, int src_uv_pitch,
> +                                     uchar *dst_v, int dst_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(src_uv);
> +
> +        dst_y[y * dst_y_pitch + x] = src_y[y * src_y_pitch + x];
> +
> +        uchar2 uv = conv_420to444(src_uv, width, height, src_uv_pitch, x, y);
> +        dst_u[y * dst_u_pitch + x] = uv.x;
> +        dst_v[y * dst_v_pitch + x] = uv.y;
> +    }
> +}
> +
> +__global__ void Convert_nv12_p010le(int width, int height,
> +                                    ushort  *dst_y,  int dst_y_pitch,  const uchar  *src_y,  int src_y_pitch,
> +                                    ushort2 *dst_uv, int dst_uv_pitch, const uchar2 *src_uv, int src_uv_pitch,
> +                                    ushort2 *unuse0, int unused_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(dst_y);
> +
> +        dst_y[y * dst_y_pitch + x] = conv_8to16(src_y[y * src_y_pitch + x], mask_10bit);
> +    }
> +
> +    if (x < width / 2 && y < height / 2) {
> +        FIX_PITCH(src_uv);
> +        FIX_PITCH(dst_uv);
> +
> +        dst_uv[y * dst_uv_pitch + x] = conv_8to16(src_uv[y * src_uv_pitch + x], mask_10bit);
> +    }
> +}
> +
> +__global__ void Convert_nv12_p016le(int width, int height,
> +                                    ushort  *dst_y,  int dst_y_pitch,  const uchar  *src_y,  int src_y_pitch,
> +                                    ushort2 *dst_uv, int dst_uv_pitch, const uchar2 *src_uv, int src_uv_pitch,
> +                                    ushort2 *unuse0, int unused_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(dst_y);
> +
> +        dst_y[y * dst_y_pitch + x] = conv_8to16(src_y[y * src_y_pitch + x], mask_16bit);
> +    }
> +
> +    if (x < width / 2 && y < height / 2) {
> +        FIX_PITCH(src_uv);
> +        FIX_PITCH(dst_uv);
> +
> +        dst_uv[y * dst_uv_pitch + x] = conv_8to16(src_uv[y * src_uv_pitch + x], mask_16bit);
> +    }
> +}
> +
> +__global__ void Convert_nv12_yuv444p16le(int width, int height,
> +                                         ushort *dst_y, int dst_y_pitch, const uchar  *src_y,  int src_y_pitch,
> +                                         ushort *dst_u, int dst_u_pitch, const uchar2 *src_uv, int src_uv_pitch,
> +                                         ushort *dst_v, int dst_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(src_uv);
> +        FIX_PITCH(dst_y);
> +        FIX_PITCH(dst_u);
> +        FIX_PITCH(dst_v);
> +
> +        dst_y[y * dst_y_pitch + x] = conv_8to16(src_y[y * src_y_pitch + x], mask_16bit);
> +
> +        ushort2 uv = conv_420to444p16<uchar2, ushort2>(src_uv, width, height, src_uv_pitch, x, y, mask_16bit);
> +        dst_u[y * dst_u_pitch + x] = uv.x;
> +        dst_v[y * dst_v_pitch + x] = uv.y;
> +    }
> +}
> +
> +}
> +
> +// yuv444p->X
> +extern "C" {
> +
> +__global__ void Convert_yuv444p_yuv420p(int width, int height,
> +                                        uchar *dst_y, int dst_y_pitch, const uchar *src_y, int src_y_pitch,
> +                                        uchar *dst_u, int dst_u_pitch, const uchar *src_u, int src_u_pitch,
> +                                        uchar *dst_v, int dst_v_pitch, const uchar *src_v, int src_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        dst_y[y * dst_y_pitch + x] = src_y[y * src_y_pitch + x];
> +
> +        if ((x & 1) == 0 && (y & 1) == 0) {
> +            int x2 = x / 2;
> +            int y2 = y / 2;
> +
> +            dst_u[y2 * dst_u_pitch + x2] = conv_444to420(src_u, src_u_pitch, x, y);
> +            dst_v[y2 * dst_v_pitch + x2] = conv_444to420(src_v, src_v_pitch, x, y);
> +        }
> +    }
> +}
> +
> +__global__ void Convert_yuv444p_nv12(int width, int height,
> +                                     uchar  *dst_y , int dst_y_pitch,  const uchar *src_y, int src_y_pitch,
> +                                     uchar2 *dst_uv, int dst_uv_pitch, const uchar *src_u, int src_u_pitch,
> +                                     uchar2 *unused, int unused_pitch, const uchar *src_v, int src_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        dst_y[y * dst_y_pitch + x] = src_y[y * src_y_pitch + x];
> +
> +        if ((x & 1) == 0 && (y & 1) == 0) {
> +            int x2 = x / 2;
> +            int y2 = y / 2;
> +            FIX_PITCH(dst_uv);
> +
> +            dst_uv[y2 * dst_uv_pitch + x2] = make_uchar2(
> +                conv_444to420(src_u, src_u_pitch, x, y),
> +                conv_444to420(src_v, src_v_pitch, x, y)
> +            );
> +        }
> +    }
> +}
> +
> +__global__ void Convert_yuv444p_p010le(int width, int height,
> +                                       ushort  *dst_y,  int dst_y_pitch,  const uchar *src_y, int src_y_pitch,
> +                                       ushort2 *dst_uv, int dst_uv_pitch, const uchar *src_u, int src_u_pitch,
> +                                       ushort2 *unused, int unused_pitch, const uchar *src_v, int src_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(dst_y);
> +
> +        dst_y[y * dst_y_pitch + x] = conv_8to16(src_y[y * src_y_pitch + x], mask_10bit);
> +
> +        if ((x & 1) == 0 && (y & 1) == 0) {
> +            int x2 = x / 2;
> +            int y2 = y / 2;
> +            FIX_PITCH(dst_uv);
> +
> +            dst_uv[y2 * dst_uv_pitch + x2] = make_ushort2(
> +                conv_444to420p16(src_u, src_u_pitch, x, y, mask_10bit),
> +                conv_444to420p16(src_v, src_v_pitch, x, y, mask_10bit)
> +            );
> +        }
> +    }
> +}
> +
> +__global__ void Convert_yuv444p_p016le(int width, int height,
> +                                       ushort  *dst_y,  int dst_y_pitch,  const uchar *src_y, int src_y_pitch,
> +                                       ushort2 *dst_uv, int dst_uv_pitch, const uchar *src_u, int src_u_pitch,
> +                                       ushort2 *unused, int unused_pitch, const uchar *src_v, int src_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(dst_y);
> +
> +        dst_y[y * dst_y_pitch + x] = conv_8to16(src_y[y * src_y_pitch + x], mask_16bit);
> +
> +        if ((x & 1) == 0 && (y & 1) == 0) {
> +            int x2 = x / 2;
> +            int y2 = y / 2;
> +            FIX_PITCH(dst_uv);
> +
> +            dst_uv[y2 * dst_uv_pitch + x2] = make_ushort2(
> +                conv_444to420p16(src_u, src_u_pitch, x, y, mask_16bit),
> +                conv_444to420p16(src_v, src_v_pitch, x, y, mask_16bit)
> +            );
> +        }
> +    }
> +}
> +
> +__global__ void Convert_yuv444p_yuv444p16le(int width, int height,
> +                                            ushort *dst_y, int dst_y_pitch, const uchar *src_y, int src_y_pitch,
> +                                            ushort *dst_u, int dst_u_pitch, const uchar *src_u, int src_u_pitch,
> +                                            ushort *dst_v, int dst_v_pitch, const uchar *src_v, int src_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(dst_y);
> +        FIX_PITCH(dst_u);
> +        FIX_PITCH(dst_v);
> +
> +        dst_y[y * dst_y_pitch + x] = conv_8to16(src_y[y * src_y_pitch + x], mask_16bit);
> +        dst_u[y * dst_u_pitch + x] = conv_8to16(src_u[y * src_u_pitch + x], mask_16bit);
> +        dst_v[y * dst_v_pitch + x] = conv_8to16(src_v[y * src_v_pitch + x], mask_16bit);
> +    }
> +}
> +
> +}
> +
> +// p010le->X
> +extern "C" {
> +
> +__global__ void Convert_p010le_yuv420p(int width, int height,
> +                                       uchar *dst_y, int dst_y_pitch, const ushort  *src_y,  int src_y_pitch,
> +                                       uchar *dst_u, int dst_u_pitch, const ushort2 *src_uv, int src_uv_pitch,
> +                                       uchar *dst_v, int dst_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(src_y);
> +
> +        dst_y[y * dst_y_pitch + x] = conv_10to8(src_y[y * src_y_pitch + x]);
> +    }
> +
> +    if (x < width / 2 && y < height / 2) {
> +        FIX_PITCH(src_uv);
> +
> +        uchar2 uv = conv_10to8(src_uv[y * src_uv_pitch + x]);
> +        dst_u[y * dst_u_pitch + x] = uv.x;
> +        dst_v[y * dst_v_pitch + x] = uv.y;
> +    }
> +}
> +
> +__global__ void Convert_p010le_nv12(int width, int height,
> +                                    uchar  *dst_y,  int dst_y_pitch,  const ushort  *src_y,  int src_y_pitch,
> +                                    uchar2 *dst_uv, int dst_uv_pitch, const ushort2 *src_uv, int src_uv_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(src_y);
> +
> +        dst_y[y * dst_y_pitch + x] = conv_10to8(src_y[y * src_y_pitch + x]);
> +    }
> +
> +    if (x < width / 2 && y < height / 2) {
> +        FIX_PITCH(dst_uv);
> +        FIX_PITCH(src_uv);
> +
> +        dst_uv[y * dst_uv_pitch + x] = conv_10to8(src_uv[y * src_uv_pitch + x]);
> +    }
> +}
> +
> +__global__ void Convert_p010le_yuv444p(int width, int height,
> +                                       uchar *dst_y, int dst_y_pitch, const ushort  *src_y,  int src_y_pitch,
> +                                       uchar *dst_u, int dst_u_pitch, const ushort2 *src_uv, int src_uv_pitch,
> +                                       uchar *dst_v, int dst_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(src_y);
> +        FIX_PITCH(src_uv);
> +
> +        dst_y[y * dst_y_pitch + x] = conv_10to8(src_y[y * src_y_pitch + x]);
> +
> +        uchar2 uv = conv_10to8(conv_420to444(src_uv, width, height, src_uv_pitch, x, y));
> +        dst_u[y * dst_u_pitch + x] = uv.x;
> +        dst_v[y * dst_v_pitch + x] = uv.y;
> +    }
> +}
> +
> +__global__ void Convert_p010le_p016le(int width, int height,
> +                                      ushort  *dst_y,  int dst_y_pitch,  const ushort  *src_y,  int src_y_pitch,
> +                                      ushort2 *dst_uv, int dst_uv_pitch, const ushort2 *src_uv, int src_uv_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(dst_y);
> +        FIX_PITCH(src_y);
> +
> +        dst_y[y * dst_y_pitch + x] = conv_10to16(src_y[y * src_y_pitch + x]);
> +    }
> +
> +    if (x < width / 2 && y < height / 2) {
> +        FIX_PITCH(dst_uv);
> +        FIX_PITCH(src_uv);
> +
> +        dst_uv[y * dst_uv_pitch + x] = conv_10to16(src_uv[y * src_uv_pitch + x]);
> +    }
> +}
> +
> +__global__ void Convert_p010le_yuv444p16le(int width, int height,
> +                                           ushort *dst_y, int dst_y_pitch, const ushort  *src_y,  int src_y_pitch,
> +                                           ushort *dst_u, int dst_u_pitch, const ushort2 *src_uv, int src_uv_pitch,
> +                                           ushort *dst_v, int dst_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(dst_y);
> +        FIX_PITCH(dst_u);
> +        FIX_PITCH(dst_v);
> +        FIX_PITCH(src_y);
> +        FIX_PITCH(src_uv);
> +
> +        dst_y[y * dst_y_pitch + x] = conv_10to16(src_y[y * src_y_pitch + x]);
> +
> +        ushort2 uv = conv_10to16(conv_420to444(src_uv, width, height, src_uv_pitch, x, y));
> +        dst_u[y * dst_u_pitch + x] = uv.x;
> +        dst_v[y * dst_v_pitch + x] = uv.y;
> +    }
> +}
> +
> +}
> +
> +// p016le->X
> +extern "C" {
> +
> +__global__ void Convert_p016le_yuv420p(int width, int height,
> +                                       uchar *dst_y, int dst_y_pitch, const ushort  *src_y,  int src_y_pitch,
> +                                       uchar *dst_u, int dst_u_pitch, const ushort2 *src_uv, int src_uv_pitch,
> +                                       uchar *dst_v, int dst_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(src_y);
> +
> +        dst_y[y * dst_y_pitch + x] = conv_16to8(src_y[y * src_y_pitch + x]);
> +    }
> +
> +    if (x < width / 2 && y < height / 2) {
> +        FIX_PITCH(src_uv);
> +
> +        uchar2 uv = conv_16to8(src_uv[y * src_uv_pitch + x]);
> +        dst_u[y * dst_u_pitch + x] = uv.x;
> +        dst_v[y * dst_v_pitch + x] = uv.y;
> +    }
> +}
> +
> +__global__ void Convert_p016le_nv12(int width, int height,
> +                                    uchar  *dst_y,  int dst_y_pitch,  const ushort  *src_y,  int src_y_pitch,
> +                                    uchar2 *dst_uv, int dst_uv_pitch, const ushort2 *src_uv, int src_uv_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(src_y);
> +
> +        dst_y[y * dst_y_pitch + x] = conv_16to8(src_y[y * src_y_pitch + x]);
> +    }
> +
> +    if (x < width / 2 && y < height / 2) {
> +        FIX_PITCH(dst_uv);
> +        FIX_PITCH(src_uv);
> +
> +        dst_uv[y * dst_uv_pitch + x] = conv_16to8(src_uv[y * src_uv_pitch + x]);
> +    }
> +}
> +
> +__global__ void Convert_p016le_yuv444p(int width, int height,
> +                                       uchar *dst_y, int dst_y_pitch, const ushort  *src_y,  int src_y_pitch,
> +                                       uchar *dst_u, int dst_u_pitch, const ushort2 *src_uv, int src_uv_pitch,
> +                                       uchar *dst_v, int dst_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(src_y);
> +        FIX_PITCH(src_uv);
> +
> +        dst_y[y * dst_y_pitch + x] = conv_16to8(src_y[y * src_y_pitch + x]);
> +
> +        uchar2 uv = conv_16to8(conv_420to444(src_uv, width, height, src_uv_pitch, x, y));
> +        dst_u[y * dst_u_pitch + x] = uv.x;
> +        dst_v[y * dst_v_pitch + x] = uv.y;
> +    }
> +}
> +
> +__global__ void Convert_p016le_p010le(int width, int height,
> +                                      ushort  *dst_y,  int dst_y_pitch,  const ushort  *src_y,  int src_y_pitch,
> +                                      ushort2 *dst_uv, int dst_uv_pitch, const ushort2 *src_uv, int src_uv_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(dst_y);
> +        FIX_PITCH(src_y);
> +
> +        dst_y[y * dst_y_pitch + x] = conv_16to10(src_y[y * src_y_pitch + x]);
> +    }
> +
> +    if (x < width / 2 && y < height / 2) {
> +        FIX_PITCH(dst_uv);
> +        FIX_PITCH(src_uv);
> +
> +        dst_uv[y * dst_uv_pitch + x] = conv_16to10(src_uv[y * src_uv_pitch + x]);
> +    }
> +}
> +
> +__global__ void Convert_p016le_yuv444p16le(int width, int height,
> +                                           ushort *dst_y, int dst_y_pitch, const ushort  *src_y,  int src_y_pitch,
> +                                           ushort *dst_u, int dst_u_pitch, const ushort2 *src_uv, int src_uv_pitch,
> +                                           ushort *dst_v, int dst_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(dst_y);
> +        FIX_PITCH(dst_u);
> +        FIX_PITCH(dst_v);
> +        FIX_PITCH(src_y);
> +        FIX_PITCH(src_uv);
> +
> +        dst_y[y * dst_y_pitch + x] = src_y[y * src_y_pitch + x];
> +
> +        ushort2 uv = conv_420to444(src_uv, width, height, src_uv_pitch, x, y);
> +        dst_u[y * dst_u_pitch + x] = uv.x;
> +        dst_v[y * dst_v_pitch + x] = uv.y;
> +    }
> +}
> +
> +}
> +
> +// yuv444p16le->X
> +extern "C" {
> +
> +__global__ void Convert_yuv444p16le_yuv420p(int width, int height,
> +                                            uchar *dst_y, int dst_y_pitch, const ushort *src_y, int src_y_pitch,
> +                                            uchar *dst_u, int dst_u_pitch, const ushort *src_u, int src_u_pitch,
> +                                            uchar *dst_v, int dst_v_pitch, const ushort *src_v, int src_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(src_y);
> +
> +        dst_y[y * dst_y_pitch + x] = conv_16to8(src_y[y * src_y_pitch + x]);
> +
> +        if ((x & 1) == 0 && (y & 1) == 0) {
> +            int x2 = x / 2;
> +            int y2 = y / 2;
> +            FIX_PITCH(src_u);
> +            FIX_PITCH(src_v);
> +
> +            dst_u[y2 * dst_u_pitch + x2] = conv_16to8(conv_444to420(src_u, src_u_pitch, x, y));
> +            dst_v[y2 * dst_v_pitch + x2] = conv_16to8(conv_444to420(src_v, src_v_pitch, x, y));
> +        }
> +    }
> +}
> +
> +__global__ void Convert_yuv444p16le_nv12(int width, int height,
> +                                         uchar  *dst_y , int dst_y_pitch,  const ushort *src_y, int src_y_pitch,
> +                                         uchar2 *dst_uv, int dst_uv_pitch, const ushort *src_u, int src_u_pitch,
> +                                         uchar2 *unused, int unused_pitch, const ushort *src_v, int src_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(src_y);
> +        dst_y[y * dst_y_pitch + x] = conv_16to8(src_y[y * src_y_pitch + x]);
> +
> +        if ((x & 1) == 0 && (y & 1) == 0) {
> +            int x2 = x / 2;
> +            int y2 = y / 2;
> +            FIX_PITCH(src_u);
> +            FIX_PITCH(src_v);
> +            FIX_PITCH(dst_uv);
> +
> +            dst_uv[y2 * dst_uv_pitch + x2] = make_uchar2(
> +                conv_16to8(conv_444to420(src_u, src_u_pitch, x, y)),
> +                conv_16to8(conv_444to420(src_v, src_v_pitch, x, y))
> +            );
> +        }
> +    }
> +}
> +
> +__global__ void Convert_yuv444p16le_yuv444p(int width, int height,
> +                                            uchar *dst_y, int dst_y_pitch, const ushort *src_y, int src_y_pitch,
> +                                            uchar *dst_u, int dst_u_pitch, const ushort *src_u, int src_u_pitch,
> +                                            uchar *dst_v, int dst_v_pitch, const ushort *src_v, int src_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(src_y);
> +        FIX_PITCH(src_u);
> +        FIX_PITCH(src_v);
> +
> +        dst_y[y * dst_y_pitch + x] = conv_16to8(src_y[y * src_y_pitch + x]);
> +        dst_u[y * dst_u_pitch + x] = conv_16to8(src_u[y * src_u_pitch + x]);
> +        dst_v[y * dst_v_pitch + x] = conv_16to8(src_v[y * src_v_pitch + x]);
> +    }
> +}
> +
> +__global__ void Convert_yuv444p16le_p010le(int width, int height,
> +                                           ushort  *dst_y,  int dst_y_pitch,  const ushort *src_y, int src_y_pitch,
> +                                           ushort2 *dst_uv, int dst_uv_pitch, const ushort *src_u, int src_u_pitch,
> +                                           ushort2 *unused, int unused_pitch, const ushort *src_v, int src_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(src_y);
> +        FIX_PITCH(dst_y);
> +        dst_y[y * dst_y_pitch + x] = conv_16to10(src_y[y * src_y_pitch + x]);
> +
> +        if ((x & 1) == 0 && (y & 1) == 0) {
> +            int x2 = x / 2;
> +            int y2 = y / 2;
> +            FIX_PITCH(src_u);
> +            FIX_PITCH(src_v);
> +            FIX_PITCH(dst_uv);
> +
> +            dst_uv[y2 * dst_uv_pitch + x2] = make_ushort2(
> +                conv_16to10(conv_444to420(src_u, src_u_pitch, x, y)),
> +                conv_16to10(conv_444to420(src_v, src_v_pitch, x, y))
> +            );
> +        }
> +    }
> +}
> +
> +__global__ void Convert_yuv444p16le_p016le(int width, int height,
> +                                           ushort  *dst_y,  int dst_y_pitch,  const ushort *src_y, int src_y_pitch,
> +                                           ushort2 *dst_uv, int dst_uv_pitch, const ushort *src_u, int src_u_pitch,
> +                                           ushort2 *unused, int unused_pitch, const ushort *src_v, int src_v_pitch)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +
> +    if (x < width && y < height) {
> +        FIX_PITCH(src_y);
> +        FIX_PITCH(dst_y);
> +        dst_y[y * dst_y_pitch + x] = src_y[y * src_y_pitch + x];
> +
> +        if ((x & 1) == 0 && (y & 1) == 0) {
> +            int x2 = x / 2;
> +            int y2 = y / 2;
> +            FIX_PITCH(src_u);
> +            FIX_PITCH(src_v);
> +            FIX_PITCH(dst_uv);
> +
> +            dst_uv[y2 * dst_uv_pitch + x2] = make_ushort2(
> +                conv_444to420(src_u, src_u_pitch, x, y),
> +                conv_444to420(src_v, src_v_pitch, x, y)
> +            );
> +        }
> +    }
> +}
> +
> +}
> -- 
> 2.25.1
> 

Thanks
Steven
Timo Rothenpieler June 11, 2021, 10:26 p.m. UTC | #2
On 11.06.2021 17:33, Steven Liu wrote:
> 
> 
>> On 11 Jun 2021, at 22:43, Timo Rothenpieler <timo@rothenpieler.org> wrote:
> Hi Timo,
>>
>> [...]
>> +Convert the input video to the @var{yuv420p} format.
>> +@example
>> +format_cuda=yuv420p
> maybe need an format for alpha blend, be used in overlay_cuda for colorkey linkly feature.

I had this lying around for a long time. It's also lacking support for 
the RGB formats.
Planned to enhance this, but never got around to it.

Kinda just want to get this merged as-is now, to enable work on 
enhancements of the filter.

Adding the alpha formats should be fairly simple, though I'm not sure 
what to do in the conversion. Should it just discard the alpha channel 
in the one direction, and add a fully opaque one the other way around?
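
For illustration, a hypothetical kernel in the style of the ones in
vf_format_cuda.cu could handle the second direction by simply writing a
fully opaque alpha plane, and the first direction by never reading the
source alpha plane at all. This is only a sketch, not part of the patch;
the yuva420p target and the kernel/parameter layout are assumptions:

__global__ void Convert_yuv420p_yuva420p(int width, int height,
                                         uchar *dst_y, int dst_y_pitch, const uchar *src_y, int src_y_pitch,
                                         uchar *dst_u, int dst_u_pitch, const uchar *src_u, int src_u_pitch,
                                         uchar *dst_v, int dst_v_pitch, const uchar *src_v, int src_v_pitch,
                                         uchar *dst_a, int dst_a_pitch, const uchar *unused, int unused_pitch)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height) {
        dst_y[y * dst_y_pitch + x] = src_y[y * src_y_pitch + x];
        dst_a[y * dst_a_pitch + x] = 0xFF; // synthesize a fully opaque alpha plane
    }

    if (x < width / 2 && y < height / 2) {
        dst_u[y * dst_u_pitch + x] = src_u[y * src_u_pitch + x];
        dst_v[y * dst_v_pitch + x] = src_v[y * src_v_pitch + x];
    }
}
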
Lynne June 12, 2021, 5:07 a.m. UTC | #3
Jun 12, 2021, 00:26 by timo@rothenpieler.org:

> On 11.06.2021 17:33, Steven Liu wrote:
>
>>> [...]
>>> +Convert the input video to the @var{yuv420p} format.
>>> +@example
>>> +format_cuda=yuv420p
>>>
>> maybe need an format for alpha blend, be used in overlay_cuda for colorkey linkly feature.
>>
>
> I had this lying around for a long time. It's also lacking support for the RGB formats.
> Planned to enhance this, but never got around to it.
>
> Kinda just want to get this merged as-is now, to enable work on enhancements of the filter.
>
> Adding the alpha formats should be fairly simple, though I'm not sure what to do in the conversion. Should it just discard the alpha channel in the one direction, and add a fully opaque one the other way around?
>

Just a nit, could the filter get merged into scale_cuda? I'm used to how
other scale filters work.
Timo Rothenpieler June 12, 2021, 11:17 a.m. UTC | #4
On 12.06.2021 07:07, Lynne wrote:
> Jun 12, 2021, 00:26 by timo@rothenpieler.org:
> 
>> On 11.06.2021 17:33, Steven Liu wrote:
>>
>>>> [...]
>>>> +Convert the input video to the @var{yuv420p} format.
>>>> +@example
>>>> +format_cuda=yuv420p
>>>>
>>> maybe need an format for alpha blend, be used in overlay_cuda for colorkey linkly feature.
>>>
>>
>> I had this lying around for a long time. It's also lacking support for the RGB formats.
>> Planned to enhance this, but never got around to it.
>>
>> Kinda just want to get this merged as-is now, to enable work on enhancements of the filter.
>>
>> Adding the alpha formats should be fairly simple, though I'm not sure what to do in the conversion. Should it just discard the alpha channel in the one direction, and add a fully opaque one the other way around?
>>
> 
> Just a nit, could the filter get merged into scale_cuda? I'm used to how
> other scale filters work.

Not easily, unless it is literally written as two independent filters
in one, which would remove most of the benefit of having it be a
single filter.
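
To make that concrete, a merged filter would end up dispatching between
two mostly unrelated code paths, roughly like the sketch below
(hypothetical names throughout, not code from either filter):

static int combined_filter_frame(AVFilterContext *ctx, AVFrame *out, AVFrame *in)
{
    CombinedContext *s = ctx->priv;  /* hypothetical context */

    if (in->width == s->out_w && in->height == s->out_h && s->in_fmt != s->out_fmt) {
        /* no scaling requested: run one of the per-format conversion
         * kernels added by this patch */
        return call_conversion_kernel(ctx, out, in);
    }

    /* scaling requested: run the existing scale_cuda resize kernels,
     * which would in turn need variants for every added format */
    return call_resize_kernel(ctx, out, in);  /* hypothetical helper */
}
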
Philip Langdale June 18, 2021, 8:53 p.m. UTC | #5
On Fri, 11 Jun 2021 16:43:07 +0200
Timo Rothenpieler <timo@rothenpieler.org> wrote:

> ---
>  configure                           |   2 +
>  doc/filters.texi                    |  46 ++
>  libavfilter/Makefile                |   1 +
>  libavfilter/allfilters.c            |   1 +
>  libavfilter/cuda/vector_helpers.cuh |  14 +-
>  libavfilter/version.h               |   2 +-
>  libavfilter/vf_format_cuda.c        | 433 ++++++++++++++
>  libavfilter/vf_format_cuda.cu       | 849
> ++++++++++++++++++++++++++++ 8 files changed, 1345 insertions(+), 3
> deletions(-) create mode 100644 libavfilter/vf_format_cuda.c
>  create mode 100644 libavfilter/vf_format_cuda.cu

LGTM. I agree it's better to get it in and then iterate on adding
additional formats.

--phil
Timo Rothenpieler June 19, 2021, 8:10 p.m. UTC | #6
On 11.06.2021 16:43, Timo Rothenpieler wrote:
> ---
>   configure                           |   2 +
>   doc/filters.texi                    |  46 ++
>   libavfilter/Makefile                |   1 +
>   libavfilter/allfilters.c            |   1 +
>   libavfilter/cuda/vector_helpers.cuh |  14 +-
>   libavfilter/version.h               |   2 +-
>   libavfilter/vf_format_cuda.c        | 433 ++++++++++++++
>   libavfilter/vf_format_cuda.cu       | 849 ++++++++++++++++++++++++++++
>   8 files changed, 1345 insertions(+), 3 deletions(-)
>   create mode 100644 libavfilter/vf_format_cuda.c
>   create mode 100644 libavfilter/vf_format_cuda.cu

Will push this soon(ish) if nobody objects.

Concerning the name, while I do agree that it internally does something
very different compared to the software format filter, from a user's
perspective it does pretty much the same thing.

So giving it a different name, like "convert_format_cuda" or something,
seems needlessly confusing to me.
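
For reference, a typical invocation in an all-CUDA pipeline would look
something like this (file names and the encoder choice are placeholders,
not taken from the patch):

ffmpeg -hwaccel cuda -hwaccel_output_format cuda -i input.mp4 \
       -vf format_cuda=nv12 -c:v h264_nvenc output.mp4

The decoded frames stay in CUDA device memory and are converted to nv12
on the GPU before being handed to the encoder.
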
Patch

diff --git a/configure b/configure
index 6bfd98b384..7041c09177 100755
--- a/configure
+++ b/configure
@@ -3078,6 +3078,8 @@  qsvvpp_select="qsv"
 vaapi_encode_deps="vaapi"
 v4l2_m2m_deps="linux_videodev2_h sem_timedwait"
 
+format_cuda_filter_deps="ffnvcodec"
+format_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 hwupload_cuda_filter_deps="ffnvcodec"
 scale_npp_filter_deps="ffnvcodec libnpp"
 scale_cuda_filter_deps="ffnvcodec"
diff --git a/doc/filters.texi b/doc/filters.texi
index 78faf767cf..27a0184cb3 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -12324,6 +12324,52 @@  format=pix_fmts=yuv420p|yuv444p|yuv410p
 @end example
 @end itemize
 
+@anchor{format_cuda}
+@section format_cuda
+
+Convert the input video to the specified pixel format.
+If the requested format matches the input format, frames are either
+copied or passed through unchanged, depending on @option{passthrough}.
+
+It accepts the following parameters:
+@table @option
+
+@item format
+The name of the desired output format.
+If none is specified, the input format will be used.
+
+@item passthrough
+If true, input frames matching the output format will be passed
+through as-is.
+If false, every input frame is processed. Frames matching the
+output format will be copied without data modification. This is
+the default mode.
+
+@end table
+
+@subsection Examples
+
+@itemize
+@item
+Convert the input video to the @var{yuv420p} format.
+@example
+format_cuda=yuv420p
+@end example
+
+Convert the input video to the @var{yuv420p} format, but pass-through any
+input that already matches the format without touching the frames at all.
+@example
+format_cuda=yuv420p:1
+@end example
+
+With no arguments, every input frame will be copied into a new
+buffer, with no further processing done. This can be useful to decouple
+a decoder with a limited buffer pool from a processing chain with deep buffers.
+@example
+format_cuda
+@end example
+@end itemize
+
 @anchor{fps}
 @section fps
 
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index bc81033e3f..1f8331c4f4 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -273,6 +273,7 @@  OBJS-$(CONFIG_FILLBORDERS_FILTER)            += vf_fillborders.o
 OBJS-$(CONFIG_FIND_RECT_FILTER)              += vf_find_rect.o lavfutils.o
 OBJS-$(CONFIG_FLOODFILL_FILTER)              += vf_floodfill.o
 OBJS-$(CONFIG_FORMAT_FILTER)                 += vf_format.o
+OBJS-$(CONFIG_FORMAT_CUDA_FILTER)            += vf_format_cuda.o vf_format_cuda.ptx.o
 OBJS-$(CONFIG_FPS_FILTER)                    += vf_fps.o
 OBJS-$(CONFIG_FRAMEPACK_FILTER)              += vf_framepack.o
 OBJS-$(CONFIG_FRAMERATE_FILTER)              += vf_framerate.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index c6afef835f..947214dc25 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -258,6 +258,7 @@  extern const AVFilter ff_vf_fillborders;
 extern const AVFilter ff_vf_find_rect;
 extern const AVFilter ff_vf_floodfill;
 extern const AVFilter ff_vf_format;
+extern const AVFilter ff_vf_format_cuda;
 extern const AVFilter ff_vf_fps;
 extern const AVFilter ff_vf_framepack;
 extern const AVFilter ff_vf_framerate;
diff --git a/libavfilter/cuda/vector_helpers.cuh b/libavfilter/cuda/vector_helpers.cuh
index 67332ef030..8ce67ce579 100644
--- a/libavfilter/cuda/vector_helpers.cuh
+++ b/libavfilter/cuda/vector_helpers.cuh
@@ -42,31 +42,41 @@  template<> struct vector_helper<int4>    { typedef float4 ftype; typedef int4 it
 
 template<typename T, typename V> inline __device__ V to_floatN(const T &a) { return (V)a; }
 template<typename T, typename V> inline __device__ T from_floatN(const V &a) { return (T)a; }
+template<typename T, typename V> inline __device__ V to_intN(const T &a) { return (V)a; }
+template<typename T, typename V> inline __device__ T from_intN(const V &a) { return (T)a; }
 
 #define OPERATORS2(T) \
     template<typename V> inline __device__ T operator+(const T &a, const V &b) { return make_ ## T (a.x + b.x, a.y + b.y); } \
     template<typename V> inline __device__ T operator-(const T &a, const V &b) { return make_ ## T (a.x - b.x, a.y - b.y); } \
     template<typename V> inline __device__ T operator*(const T &a, V b) { return make_ ## T (a.x * b, a.y * b); } \
     template<typename V> inline __device__ T operator/(const T &a, V b) { return make_ ## T (a.x / b, a.y / b); } \
+    template<typename V> inline __device__ T operator&(const T &a, V b) { return make_ ## T (a.x & b, a.y & b); } \
+    template<typename V> inline __device__ T operator|(const T &a, V b) { return make_ ## T (a.x | b, a.y | b); } \
     template<typename V> inline __device__ T operator>>(const T &a, V b) { return make_ ## T (a.x >> b, a.y >> b); } \
     template<typename V> inline __device__ T operator<<(const T &a, V b) { return make_ ## T (a.x << b, a.y << b); } \
     template<typename V> inline __device__ T &operator+=(T &a, const V &b) { a.x += b.x; a.y += b.y; return a; } \
     template<typename V> inline __device__ void vec_set(T &a, const V &b) { a.x = b.x; a.y = b.y; } \
     template<typename V> inline __device__ void vec_set_scalar(T &a, V b) { a.x = b; a.y = b; } \
     template<> inline __device__ float2 to_floatN<T, float2>(const T &a) { return make_float2(a.x, a.y); } \
-    template<> inline __device__ T from_floatN<T, float2>(const float2 &a) { return make_ ## T(a.x, a.y); }
+    template<> inline __device__ T from_floatN<T, float2>(const float2 &a) { return make_ ## T(a.x, a.y); } \
+    template<> inline __device__ int2 to_intN<T, int2>(const T &a) { return make_int2(a.x, a.y); } \
+    template<> inline __device__ T from_intN<T, int2>(const int2 &a) { return make_ ## T(a.x, a.y); }
 #define OPERATORS4(T) \
     template<typename V> inline __device__ T operator+(const T &a, const V &b) { return make_ ## T (a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } \
     template<typename V> inline __device__ T operator-(const T &a, const V &b) { return make_ ## T (a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } \
     template<typename V> inline __device__ T operator*(const T &a, V b) { return make_ ## T (a.x * b, a.y * b, a.z * b, a.w * b); } \
     template<typename V> inline __device__ T operator/(const T &a, V b) { return make_ ## T (a.x / b, a.y / b, a.z / b, a.w / b); } \
+    template<typename V> inline __device__ T operator&(const T &a, V b) { return make_ ## T (a.x & b, a.y & b, a.z & b, a.w & b); } \
+    template<typename V> inline __device__ T operator|(const T &a, V b) { return make_ ## T (a.x | b, a.y | b, a.z | b, a.w | b); } \
     template<typename V> inline __device__ T operator>>(const T &a, V b) { return make_ ## T (a.x >> b, a.y >> b, a.z >> b, a.w >> b); } \
     template<typename V> inline __device__ T operator<<(const T &a, V b) { return make_ ## T (a.x << b, a.y << b, a.z << b, a.w << b); } \
     template<typename V> inline __device__ T &operator+=(T &a, const V &b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } \
     template<typename V> inline __device__ void vec_set(T &a, const V &b) { a.x = b.x; a.y = b.y; a.z = b.z; a.w = b.w; } \
     template<typename V> inline __device__ void vec_set_scalar(T &a, V b) { a.x = b; a.y = b; a.z = b; a.w = b; } \
     template<> inline __device__ float4 to_floatN<T, float4>(const T &a) { return make_float4(a.x, a.y, a.z, a.w); } \
-    template<> inline __device__ T from_floatN<T, float4>(const float4 &a) { return make_ ## T(a.x, a.y, a.z, a.w); }
+    template<> inline __device__ T from_floatN<T, float4>(const float4 &a) { return make_ ## T(a.x, a.y, a.z, a.w); } \
+    template<> inline __device__ int4 to_intN<T, int4>(const T &a) { return make_int4(a.x, a.y, a.z, a.w); } \
+    template<> inline __device__ T from_intN<T, int4>(const int4 &a) { return make_ ## T(a.x, a.y, a.z, a.w); }
 
 OPERATORS2(int2)
 OPERATORS2(uchar2)
diff --git a/libavfilter/version.h b/libavfilter/version.h
index 5052681653..fbb81ef31c 100644
--- a/libavfilter/version.h
+++ b/libavfilter/version.h
@@ -31,7 +31,7 @@ 
 
 #define LIBAVFILTER_VERSION_MAJOR   8
 #define LIBAVFILTER_VERSION_MINOR   0
-#define LIBAVFILTER_VERSION_MICRO 102
+#define LIBAVFILTER_VERSION_MICRO 103
 
 
 #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
diff --git a/libavfilter/vf_format_cuda.c b/libavfilter/vf_format_cuda.c
new file mode 100644
index 0000000000..89f05b1350
--- /dev/null
+++ b/libavfilter/vf_format_cuda.c
@@ -0,0 +1,433 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <float.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "libavutil/avstring.h"
+#include "libavutil/common.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+#include "libavutil/internal.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "avfilter.h"
+#include "formats.h"
+#include "internal.h"
+#include "video.h"
+
+static const enum AVPixelFormat supported_formats[] = {
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NV12,
+    AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_P010,
+    AV_PIX_FMT_P016,
+    AV_PIX_FMT_YUV444P16,
+};
+
+#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
+#define BLOCKX 32
+#define BLOCKY 16
+
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
+
+typedef struct CUDAFormatContext {
+    const AVClass *class;
+
+    AVCUDADeviceContext *hwctx;
+
+    enum AVPixelFormat in_fmt;
+    enum AVPixelFormat out_fmt;
+
+    AVBufferRef *frames_ctx;
+    AVFrame     *frame;
+
+    AVFrame *tmp_frame;
+    int passthrough;
+
+    /**
+     * Output sw format. AV_PIX_FMT_NONE for no conversion.
+     */
+    enum AVPixelFormat format;
+
+    CUcontext   cu_ctx;
+    CUmodule    cu_module;
+    CUfunction  cu_func_convert;
+    CUstream    cu_stream;
+} CUDAFormatContext;
+
+static av_cold int cudaformat_init(AVFilterContext *ctx)
+{
+    CUDAFormatContext *s = ctx->priv;
+
+    s->frame = av_frame_alloc();
+    if (!s->frame)
+        return AVERROR(ENOMEM);
+
+    s->tmp_frame = av_frame_alloc();
+    if (!s->tmp_frame)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static av_cold void cudaformat_uninit(AVFilterContext *ctx)
+{
+    CUDAFormatContext *s = ctx->priv;
+
+    if (s->hwctx && s->cu_module) {
+        CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+        CUcontext dummy;
+
+        CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
+        CHECK_CU(cu->cuModuleUnload(s->cu_module));
+        s->cu_module = NULL;
+        CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    }
+
+    av_frame_free(&s->frame);
+    av_buffer_unref(&s->frames_ctx);
+    av_frame_free(&s->tmp_frame);
+}
+
+static int cudaformat_query_formats(AVFilterContext *ctx)
+{
+    static const enum AVPixelFormat pixel_formats[] = {
+        AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE,
+    };
+    AVFilterFormats *pix_fmts = ff_make_format_list(pixel_formats);
+    if (!pix_fmts)
+        return AVERROR(ENOMEM);
+
+    return ff_set_common_formats(ctx, pix_fmts);
+}
+
+static av_cold int init_hwframe_ctx(CUDAFormatContext *s, AVBufferRef *device_ctx, int width, int height)
+{
+    AVBufferRef *out_ref = NULL;
+    AVHWFramesContext *out_ctx;
+    int ret;
+
+    out_ref = av_hwframe_ctx_alloc(device_ctx);
+    if (!out_ref)
+        return AVERROR(ENOMEM);
+    out_ctx = (AVHWFramesContext*)out_ref->data;
+
+    out_ctx->format    = AV_PIX_FMT_CUDA;
+    out_ctx->sw_format = s->out_fmt;
+    out_ctx->width     = FFALIGN(width,  32);
+    out_ctx->height    = FFALIGN(height, 32);
+
+    ret = av_hwframe_ctx_init(out_ref);
+    if (ret < 0)
+        goto fail;
+
+    av_frame_unref(s->frame);
+    ret = av_hwframe_get_buffer(out_ref, s->frame, 0);
+    if (ret < 0)
+        goto fail;
+
+    s->frame->width  = width;
+    s->frame->height = height;
+
+    av_buffer_unref(&s->frames_ctx);
+    s->frames_ctx = out_ref;
+
+    return 0;
+fail:
+    av_buffer_unref(&out_ref);
+    return ret;
+}
+
+static int format_is_supported(enum AVPixelFormat fmt)
+{
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
+        if (supported_formats[i] == fmt)
+            return 1;
+    return 0;
+}
+
+static av_cold int init_processing_chain(AVFilterContext *ctx, int width, int height)
+{
+    CUDAFormatContext *s = ctx->priv;
+
+    AVHWFramesContext *in_frames_ctx;
+
+    enum AVPixelFormat in_format;
+    enum AVPixelFormat out_format;
+    int ret;
+
+    /* check that we have a hw context */
+    if (!ctx->inputs[0]->hw_frames_ctx) {
+        av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n");
+        return AVERROR(EINVAL);
+    }
+    in_frames_ctx = (AVHWFramesContext*)ctx->inputs[0]->hw_frames_ctx->data;
+    in_format     = in_frames_ctx->sw_format;
+    out_format    = (s->format == AV_PIX_FMT_NONE) ? in_format : s->format;
+
+    if (!format_is_supported(in_format)) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n",
+               av_get_pix_fmt_name(in_format));
+        return AVERROR(ENOSYS);
+    }
+    if (!format_is_supported(out_format)) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n",
+               av_get_pix_fmt_name(out_format));
+        return AVERROR(ENOSYS);
+    }
+
+    s->in_fmt = in_format;
+    s->out_fmt = out_format;
+
+    if (s->passthrough && in_format == out_format) {
+        s->frames_ctx = av_buffer_ref(ctx->inputs[0]->hw_frames_ctx);
+        if (!s->frames_ctx)
+            return AVERROR(ENOMEM);
+    } else {
+        s->passthrough = 0;
+
+        ret = init_hwframe_ctx(s, in_frames_ctx->device_ref, width, height);
+        if (ret < 0)
+            return ret;
+    }
+
+    ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->frames_ctx);
+    if (!ctx->outputs[0]->hw_frames_ctx)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static av_cold int cudaformat_config_props(AVFilterLink *outlink)
+{
+    AVFilterContext *ctx = outlink->src;
+    AVFilterLink *inlink = outlink->src->inputs[0];
+    CUDAFormatContext *s = ctx->priv;
+    AVHWFramesContext *frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
+    AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx;
+    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
+    CudaFunctions *cu = device_hwctx->internal->cuda_dl;
+    const char *in_fmt_name, *out_fmt_name;
+    char buf[64];
+    int ret;
+
+    extern char vf_format_cuda_ptx[];
+
+    s->hwctx = device_hwctx;
+    s->cu_stream = s->hwctx->stream;
+
+    ret = init_processing_chain(ctx, inlink->w, inlink->h);
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(cu->cuModuleLoadData(&s->cu_module, vf_format_cuda_ptx));
+    if (ret < 0)
+        goto fail;
+
+    in_fmt_name = av_get_pix_fmt_name(s->in_fmt);
+    out_fmt_name = av_get_pix_fmt_name(s->out_fmt);
+    snprintf(buf, sizeof(buf), "Convert_%s_%s", in_fmt_name, out_fmt_name);
+
+    if (s->in_fmt != s->out_fmt) {
+        av_log(ctx, AV_LOG_DEBUG, "Loading conversion kernel: %s\n", buf);
+        ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_convert, s->cu_module, buf));
+        if (ret < 0)
+            goto fail;
+    }
+
+    outlink->w = inlink->w;
+    outlink->h = inlink->h;
+    outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
+
+    av_log(ctx, AV_LOG_VERBOSE, "%s -> %s%s\n",
+           in_fmt_name, out_fmt_name, s->passthrough ? " (passthrough)" : "");
+
+fail:
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    return ret;
+}
+
+static int call_conversion_kernel(AVFilterContext *ctx,
+                                  AVFrame *out, AVFrame *in)
+{
+    CUDAFormatContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+    int ret;
+
+    CUdeviceptr dst_devptrs[] = { (CUdeviceptr)out->data[0], (CUdeviceptr)out->data[1],
+                                  (CUdeviceptr)out->data[2], (CUdeviceptr)out->data[3] };
+    CUdeviceptr src_devptrs[] = { (CUdeviceptr)in->data[0],  (CUdeviceptr)in->data[1],
+                                  (CUdeviceptr)in->data[2],  (CUdeviceptr)in->data[3] };
+    void *args[] = { &in->width, &in->height,
+                     &dst_devptrs[0], &out->linesize[0], &src_devptrs[0], &in->linesize[0],
+                     &dst_devptrs[1], &out->linesize[1], &src_devptrs[1], &in->linesize[1],
+                     &dst_devptrs[2], &out->linesize[2], &src_devptrs[2], &in->linesize[2],
+                     &dst_devptrs[3], &out->linesize[3], &src_devptrs[3], &in->linesize[3] };
+
+    ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func_convert,
+                                      DIV_UP(out->width, BLOCKX), DIV_UP(out->height, BLOCKY), 1,
+                                      BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL));
+    if (ret < 0)
+        return ret;
+
+    return 0;
+}
+
+static int cudaformat_convert(AVFilterContext *ctx, AVFrame *out, AVFrame *in)
+{
+    CUDAFormatContext *s = ctx->priv;
+    AVFilterLink *outlink = ctx->outputs[0];
+    AVFrame *src = in;
+    int ret;
+
+    if (s->in_fmt != s->out_fmt)
+        ret = call_conversion_kernel(ctx, s->frame, src);
+    else
+        ret = av_hwframe_transfer_data(s->frame, src, 0);
+    if (ret < 0)
+        return ret;
+
+    src = s->frame;
+    ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0);
+    if (ret < 0)
+        return ret;
+
+    av_frame_move_ref(out, s->frame);
+    av_frame_move_ref(s->frame, s->tmp_frame);
+
+    s->frame->width  = outlink->w;
+    s->frame->height = outlink->h;
+
+    ret = av_frame_copy_props(out, in);
+    if (ret < 0)
+        return ret;
+
+    return 0;
+}
+
+static int cudaformat_filter_frame(AVFilterLink *link, AVFrame *in)
+{
+    AVFilterContext       *ctx = link->dst;
+    CUDAFormatContext        *s = ctx->priv;
+    AVFilterLink      *outlink = ctx->outputs[0];
+    CudaFunctions          *cu = s->hwctx->internal->cuda_dl;
+
+    AVFrame *out = NULL;
+    CUcontext dummy;
+    int ret = 0;
+
+    if (s->passthrough)
+        return ff_filter_frame(outlink, in);
+
+    out = av_frame_alloc();
+    if (!out) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
+    if (ret < 0)
+        goto fail;
+
+    ret = cudaformat_convert(ctx, out, in);
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    if (ret < 0)
+        goto fail;
+
+    av_frame_free(&in);
+    return ff_filter_frame(outlink, out);
+
+fail:
+    av_frame_free(&in);
+    av_frame_free(&out);
+    return ret;
+}
+
+static AVFrame *cudaformat_get_video_buffer(AVFilterLink *inlink, int w, int h)
+{
+    CUDAFormatContext *s = inlink->dst->priv;
+
+    return s->passthrough ?
+        ff_null_get_video_buffer   (inlink, w, h) :
+        ff_default_get_video_buffer(inlink, w, h);
+}
+
+#define OFFSET(x) offsetof(CUDAFormatContext, x)
+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM)
+static const AVOption options[] = {
+    { "format", "Output pixel format", OFFSET(format), AV_OPT_TYPE_PIXEL_FMT, { .i64 = AV_PIX_FMT_NONE }, AV_PIX_FMT_NONE, INT_MAX, FLAGS },
+    { "passthrough", "Do not process frames at all if parameters match", OFFSET(passthrough), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS },
+    { NULL },
+};
+
+static const AVClass cudaformat_class = {
+    .class_name = "cudaformat",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVFilterPad cudaformat_inputs[] = {
+    {
+        .name        = "default",
+        .type        = AVMEDIA_TYPE_VIDEO,
+        .filter_frame = cudaformat_filter_frame,
+        .get_video_buffer = cudaformat_get_video_buffer,
+    },
+    { NULL }
+};
+
+static const AVFilterPad cudaformat_outputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = cudaformat_config_props,
+    },
+    { NULL }
+};
+
+const AVFilter ff_vf_format_cuda = {
+    .name      = "format_cuda",
+    .description = NULL_IF_CONFIG_SMALL("GPU accelerated video format conversion"),
+
+    .init          = cudaformat_init,
+    .uninit        = cudaformat_uninit,
+    .query_formats = cudaformat_query_formats,
+
+    .priv_size = sizeof(CUDAFormatContext),
+    .priv_class = &cudaformat_class,
+
+    .inputs    = cudaformat_inputs,
+    .outputs   = cudaformat_outputs,
+
+    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};
diff --git a/libavfilter/vf_format_cuda.cu b/libavfilter/vf_format_cuda.cu
new file mode 100644
index 0000000000..f706f9b13d
--- /dev/null
+++ b/libavfilter/vf_format_cuda.cu
@@ -0,0 +1,849 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "cuda/vector_helpers.cuh"
+
+static const ushort mask_10bit = 0xFFC0;
+static const ushort mask_16bit = 0xFFFF;
+
+static inline __device__ ushort conv_8to16(uchar in, ushort mask)
+{
+    return ((ushort)in | ((ushort)in << 8)) & mask;
+}
+
+static inline __device__ ushort2 conv_8to16(uchar2 in, ushort mask)
+{
+    return make_ushort2(
+        conv_8to16(in.x, mask),
+        conv_8to16(in.y, mask)
+    );
+}
+
+static inline __device__ uchar conv_16to8(ushort in)
+{
+    return in >> 8;
+}
+
+static inline __device__ uchar2 conv_16to8(ushort2 in)
+{
+    return make_uchar2(
+        conv_16to8(in.x),
+        conv_16to8(in.y)
+    );
+}
+
+static inline __device__ uchar conv_10to8(ushort in)
+{
+    return in >> 8;
+}
+
+static inline __device__ uchar2 conv_10to8(ushort2 in)
+{
+    return make_uchar2(
+        conv_10to8(in.x),
+        conv_10to8(in.y)
+    );
+}
+
+static inline __device__ ushort conv_10to16(ushort in)
+{
+    return in | (in >> 10);
+}
+
+static inline __device__ ushort2 conv_10to16(ushort2 in)
+{
+    return make_ushort2(
+        conv_10to16(in.x),
+        conv_10to16(in.y)
+    );
+}
+
+static inline __device__ ushort conv_16to10(ushort in)
+{
+    return in & mask_10bit;
+}
+
+static inline __device__ ushort2 conv_16to10(ushort2 in)
+{
+    return make_ushort2(
+        conv_16to10(in.x),
+        conv_16to10(in.y)
+    );
+}
+
+template<typename T>
+static inline __device__ T conv_444to420(const T *src, int pitch, int x, int y)
+{
+    unsigned tmp = (unsigned)src[ y      * pitch +  x] +
+                   (unsigned)src[(y + 1) * pitch +  x] +
+                   (unsigned)src[ y      * pitch + (x + 1)] +
+                   (unsigned)src[(y + 1) * pitch + (x + 1)];
+    return tmp / 4;
+}
+
+static inline __device__ ushort conv_444to420p16(const uchar *src, int pitch, int x, int y, ushort mask)
+{
+    unsigned tmp = (unsigned)conv_8to16(src[ y      * pitch +  x], mask_16bit) +
+                   (unsigned)conv_8to16(src[(y + 1) * pitch +  x], mask_16bit) +
+                   (unsigned)conv_8to16(src[ y      * pitch + (x + 1)], mask_16bit) +
+                   (unsigned)conv_8to16(src[(y + 1) * pitch + (x + 1)], mask_16bit);
+    return (tmp / 4) & mask;
+}
+
+template<typename T>
+static inline __device__ T conv_420to444(const T *src, int width, int height, int pitch, int x, int y)
+{
+    int x1 = x / 2;
+    int y1 = y / 2;
+    int x2 = min(x1 + 1, (width / 2) - 1);
+    int y2 = min(y1 + 1, (height / 2) - 1);
+
+    intT tmp;
+    vec_set_scalar(tmp, 0);
+    tmp += to_intN<T, intT>(src[y1 * pitch + x1]);
+    tmp += to_intN<T, intT>(src[y1 * pitch + x2]);
+    tmp += to_intN<T, intT>(src[y2 * pitch + x1]);
+    tmp += to_intN<T, intT>(src[y2 * pitch + x2]);
+
+    return from_intN<T, intT>(tmp / 4);
+}
+
+template<typename T, typename O>
+static inline __device__ O conv_420to444p16(const T *src, int width, int height, int pitch, int x, int y, ushort mask)
+{
+    int x1 = x / 2;
+    int y1 = y / 2;
+    int x2 = min(x1 + 1, (width / 2) - 1);
+    int y2 = min(y1 + 1, (height / 2) - 1);
+
+    intT tmp;
+    vec_set_scalar(tmp, 0);
+    tmp += to_intN<O, intT>(conv_8to16(src[y1 * pitch + x1], mask_16bit));
+    tmp += to_intN<O, intT>(conv_8to16(src[y1 * pitch + x2], mask_16bit));
+    tmp += to_intN<O, intT>(conv_8to16(src[y2 * pitch + x1], mask_16bit));
+    tmp += to_intN<O, intT>(conv_8to16(src[y2 * pitch + x2], mask_16bit));
+
+    return from_intN<O, intT>((tmp / 4) & mask);
+}
+
+#define FIX_PITCH(name) name ## _pitch /= sizeof(*name)
+
+// yuv420p->X
+extern "C" {
+
+__global__ void Convert_yuv420p_nv12(int width, int height,
+                                     uchar  *dst_y  , int dst_y_pitch , const uchar *src_y, int src_y_pitch,
+                                     uchar2 *dst_uv , int dst_uv_pitch, const uchar *src_u, int src_u_pitch,
+                                     uchar  *unused0, int unused1     , const uchar *src_v, int src_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        dst_y[y * dst_y_pitch + x] = src_y[y * src_y_pitch + x];
+    }
+
+    if (x < width / 2 && y < height / 2) {
+        FIX_PITCH(dst_uv);
+
+        dst_uv[y * dst_uv_pitch + x] = make_uchar2(
+            src_u[y * src_u_pitch + x],
+            src_v[y * src_v_pitch + x]
+        );
+    }
+}
+
+__global__ void Convert_yuv420p_yuv444p(int width, int height,
+                                        uchar *dst_y, int dst_y_pitch, const uchar *src_y, int src_y_pitch,
+                                        uchar *dst_u, int dst_u_pitch, const uchar *src_u, int src_u_pitch,
+                                        uchar *dst_v, int dst_v_pitch, const uchar *src_v, int src_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        dst_y[y * dst_y_pitch + x] = src_y[y  * src_y_pitch + x];
+        dst_u[y * dst_u_pitch + x] = conv_420to444(src_u, width, height, src_u_pitch, x, y);
+        dst_v[y * dst_v_pitch + x] = conv_420to444(src_v, width, height, src_v_pitch, x, y);
+    }
+}
+
+__global__ void Convert_yuv420p_p010le(int width, int height,
+                                       ushort  *dst_y,  int dst_y_pitch,  const uchar *src_y, int src_y_pitch,
+                                       ushort2 *dst_uv, int dst_uv_pitch, const uchar *src_u, int src_u_pitch,
+                                       ushort2 *unuse0, int unused_pitch, const uchar *src_v, int src_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(dst_y);
+
+        dst_y[y * dst_y_pitch + x] = conv_8to16(src_y[y * src_y_pitch + x], mask_10bit);
+    }
+
+    if (x < width / 2 && y < height / 2) {
+        FIX_PITCH(dst_uv);
+
+        dst_uv[y * dst_uv_pitch + x] = make_ushort2(
+            conv_8to16(src_u[y * src_u_pitch + x], mask_10bit),
+            conv_8to16(src_v[y * src_v_pitch + x], mask_10bit)
+        );
+    }
+}
+
+__global__ void Convert_yuv420p_p016le(int width, int height,
+                                       ushort  *dst_y,  int dst_y_pitch,  const uchar *src_y, int src_y_pitch,
+                                       ushort2 *dst_uv, int dst_uv_pitch, const uchar *src_u, int src_u_pitch,
+                                       ushort2 *unuse0, int unused_pitch, const uchar *src_v, int src_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(dst_y);
+
+        dst_y[y * dst_y_pitch + x] = conv_8to16(src_y[y * src_y_pitch + x], mask_16bit);
+    }
+
+    if (x < width / 2 && y < height / 2) {
+        FIX_PITCH(dst_uv);
+
+        dst_uv[y * dst_uv_pitch + x] = make_ushort2(
+            conv_8to16(src_u[y * src_u_pitch + x], mask_16bit),
+            conv_8to16(src_v[y * src_v_pitch + x], mask_16bit)
+        );
+    }
+}
+
+__global__ void Convert_yuv420p_yuv444p16le(int width, int height,
+                                            ushort *dst_y, int dst_y_pitch, const uchar *src_y, int src_y_pitch,
+                                            ushort *dst_u, int dst_u_pitch, const uchar *src_u, int src_u_pitch,
+                                            ushort *dst_v, int dst_v_pitch, const uchar *src_v, int src_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(dst_y);
+        FIX_PITCH(dst_u);
+        FIX_PITCH(dst_v);
+
+        dst_y[y * dst_y_pitch + x] = conv_8to16(src_y[y  * src_y_pitch + x],  mask_16bit);
+        dst_u[y * dst_u_pitch + x] = conv_420to444p16<uchar, ushort>(src_u, width, height, src_u_pitch, x, y, mask_16bit);
+        dst_v[y * dst_v_pitch + x] = conv_420to444p16<uchar, ushort>(src_v, width, height, src_v_pitch, x, y, mask_16bit);
+    }
+}
+
+}
+
+// nv12->X
+extern "C" {
+
+__global__ void Convert_nv12_yuv420p(int width, int height,
+                                     uchar *dst_y, int dst_y_pitch, const uchar  *src_y,  int src_y_pitch,
+                                     uchar *dst_u, int dst_u_pitch, const uchar2 *src_uv, int src_uv_pitch,
+                                     uchar *dst_v, int dst_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        dst_y[y * dst_y_pitch + x] = src_y[y * src_y_pitch + x];
+    }
+
+    if (x < width / 2 && y < height / 2) {
+        FIX_PITCH(src_uv);
+
+        const uchar2 &uv = src_uv[y * src_uv_pitch + x];
+        dst_u[y * dst_u_pitch + x] = uv.x;
+        dst_v[y * dst_v_pitch + x] = uv.y;
+    }
+}
+
+__global__ void Convert_nv12_yuv444p(int width, int height,
+                                     uchar *dst_y, int dst_y_pitch, const uchar  *src_y,  int src_y_pitch,
+                                     uchar *dst_u, int dst_u_pitch, const uchar2 *src_uv, int src_uv_pitch,
+                                     uchar *dst_v, int dst_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(src_uv);
+
+        dst_y[y * dst_y_pitch + x] = src_y[y * src_y_pitch + x];
+
+        uchar2 uv = conv_420to444(src_uv, width, height, src_uv_pitch, x, y);
+        dst_u[y * dst_u_pitch + x] = uv.x;
+        dst_v[y * dst_v_pitch + x] = uv.y;
+    }
+}
+
+__global__ void Convert_nv12_p010le(int width, int height,
+                                    ushort  *dst_y,  int dst_y_pitch,  const uchar  *src_y,  int src_y_pitch,
+                                    ushort2 *dst_uv, int dst_uv_pitch, const uchar2 *src_uv, int src_uv_pitch,
+                                    ushort2 *unuse0, int unused_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(dst_y);
+
+        dst_y[y * dst_y_pitch + x] = conv_8to16(src_y[y * src_y_pitch + x], mask_10bit);
+    }
+
+    if (x < width / 2 && y < height / 2) {
+        FIX_PITCH(src_uv);
+        FIX_PITCH(dst_uv);
+
+        dst_uv[y * dst_uv_pitch + x] = conv_8to16(src_uv[y * src_uv_pitch + x], mask_10bit);
+    }
+}
+
+__global__ void Convert_nv12_p016le(int width, int height,
+                                    ushort  *dst_y,  int dst_y_pitch,  const uchar  *src_y,  int src_y_pitch,
+                                    ushort2 *dst_uv, int dst_uv_pitch, const uchar2 *src_uv, int src_uv_pitch,
+                                    ushort2 *unuse0, int unused_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(dst_y);
+
+        dst_y[y * dst_y_pitch + x] = conv_8to16(src_y[y * src_y_pitch + x], mask_16bit);
+    }
+
+    if (x < width / 2 && y < height / 2) {
+        FIX_PITCH(src_uv);
+        FIX_PITCH(dst_uv);
+
+        dst_uv[y * dst_uv_pitch + x] = conv_8to16(src_uv[y * src_uv_pitch + x], mask_16bit);
+    }
+}
+
+__global__ void Convert_nv12_yuv444p16le(int width, int height,
+                                         ushort *dst_y, int dst_y_pitch, const uchar  *src_y,  int src_y_pitch,
+                                         ushort *dst_u, int dst_u_pitch, const uchar2 *src_uv, int src_uv_pitch,
+                                         ushort *dst_v, int dst_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(src_uv);
+        FIX_PITCH(dst_y);
+        FIX_PITCH(dst_u);
+        FIX_PITCH(dst_v);
+
+        dst_y[y * dst_y_pitch + x] = conv_8to16(src_y[y * src_y_pitch + x], mask_16bit);
+
+        ushort2 uv = conv_420to444p16<uchar2, ushort2>(src_uv, width, height, src_uv_pitch, x, y, mask_16bit);
+        dst_u[y * dst_u_pitch + x] = uv.x;
+        dst_v[y * dst_v_pitch + x] = uv.y;
+    }
+}
+
+}
+
+// yuv444p->X
+extern "C" {
+
+__global__ void Convert_yuv444p_yuv420p(int width, int height,
+                                        uchar *dst_y, int dst_y_pitch, const uchar *src_y, int src_y_pitch,
+                                        uchar *dst_u, int dst_u_pitch, const uchar *src_u, int src_u_pitch,
+                                        uchar *dst_v, int dst_v_pitch, const uchar *src_v, int src_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        dst_y[y * dst_y_pitch + x] = src_y[y * src_y_pitch + x];
+
+        if ((x & 1) == 0 && (y & 1) == 0) {
+            int x2 = x / 2;
+            int y2 = y / 2;
+
+            dst_u[y2 * dst_u_pitch + x2] = conv_444to420(src_u, src_u_pitch, x, y);
+            dst_v[y2 * dst_v_pitch + x2] = conv_444to420(src_v, src_v_pitch, x, y);
+        }
+    }
+}
+
+__global__ void Convert_yuv444p_nv12(int width, int height,
+                                     uchar  *dst_y , int dst_y_pitch,  const uchar *src_y, int src_y_pitch,
+                                     uchar2 *dst_uv, int dst_uv_pitch, const uchar *src_u, int src_u_pitch,
+                                     uchar2 *unused, int unused_pitch, const uchar *src_v, int src_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        dst_y[y * dst_y_pitch + x] = src_y[y * src_y_pitch + x];
+
+        if ((x & 1) == 0 && (y & 1) == 0) {
+            int x2 = x / 2;
+            int y2 = y / 2;
+            FIX_PITCH(dst_uv);
+
+            dst_uv[y2 * dst_uv_pitch + x2] = make_uchar2(
+                conv_444to420(src_u, src_u_pitch, x, y),
+                conv_444to420(src_v, src_v_pitch, x, y)
+            );
+        }
+    }
+}
+
+__global__ void Convert_yuv444p_p010le(int width, int height,
+                                       ushort  *dst_y,  int dst_y_pitch,  const uchar *src_y, int src_y_pitch,
+                                       ushort2 *dst_uv, int dst_uv_pitch, const uchar *src_u, int src_u_pitch,
+                                       ushort2 *unused, int unused_pitch, const uchar *src_v, int src_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(dst_y);
+
+        dst_y[y * dst_y_pitch + x] = conv_8to16(src_y[y * src_y_pitch + x], mask_10bit);
+
+        if ((x & 1) == 0 && (y & 1) == 0) {
+            int x2 = x / 2;
+            int y2 = y / 2;
+            FIX_PITCH(dst_uv);
+
+            dst_uv[y2 * dst_uv_pitch + x2] = make_ushort2(
+                conv_444to420p16(src_u, src_u_pitch, x, y, mask_10bit),
+                conv_444to420p16(src_v, src_v_pitch, x, y, mask_10bit)
+            );
+        }
+    }
+}
+
+__global__ void Convert_yuv444p_p016le(int width, int height,
+                                       ushort  *dst_y,  int dst_y_pitch,  const uchar *src_y, int src_y_pitch,
+                                       ushort2 *dst_uv, int dst_uv_pitch, const uchar *src_u, int src_u_pitch,
+                                       ushort2 *unused, int unused_pitch, const uchar *src_v, int src_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(dst_y);
+
+        dst_y[y * dst_y_pitch + x] = conv_8to16(src_y[y * src_y_pitch + x], mask_16bit);
+
+        if ((x & 1) == 0 && (y & 1) == 0) {
+            int x2 = x / 2;
+            int y2 = y / 2;
+            FIX_PITCH(dst_uv);
+
+            dst_uv[y2 * dst_uv_pitch + x2] = make_ushort2(
+                conv_444to420p16(src_u, src_u_pitch, x, y, mask_16bit),
+                conv_444to420p16(src_v, src_v_pitch, x, y, mask_16bit)
+            );
+        }
+    }
+}
+
+__global__ void Convert_yuv444p_yuv444p16le(int width, int height,
+                                            ushort *dst_y, int dst_y_pitch, const uchar *src_y, int src_y_pitch,
+                                            ushort *dst_u, int dst_u_pitch, const uchar *src_u, int src_u_pitch,
+                                            ushort *dst_v, int dst_v_pitch, const uchar *src_v, int src_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(dst_y);
+        FIX_PITCH(dst_u);
+        FIX_PITCH(dst_v);
+
+        dst_y[y * dst_y_pitch + x] = conv_8to16(src_y[y * src_y_pitch + x], mask_16bit);
+        dst_u[y * dst_u_pitch + x] = conv_8to16(src_u[y * src_u_pitch + x], mask_16bit);
+        dst_v[y * dst_v_pitch + x] = conv_8to16(src_v[y * src_v_pitch + x], mask_16bit);
+    }
+}
+
+}
+
+// p010le->X
+extern "C" {
+
+__global__ void Convert_p010le_yuv420p(int width, int height,
+                                       uchar *dst_y, int dst_y_pitch, const ushort  *src_y,  int src_y_pitch,
+                                       uchar *dst_u, int dst_u_pitch, const ushort2 *src_uv, int src_uv_pitch,
+                                       uchar *dst_v, int dst_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(src_y);
+
+        dst_y[y * dst_y_pitch + x] = conv_10to8(src_y[y * src_y_pitch + x]);
+    }
+
+    if (x < width / 2 && y < height / 2) {
+        FIX_PITCH(src_uv);
+
+        uchar2 uv = conv_10to8(src_uv[y * src_uv_pitch + x]);
+        dst_u[y * dst_u_pitch + x] = uv.x;
+        dst_v[y * dst_v_pitch + x] = uv.y;
+    }
+}
+
+__global__ void Convert_p010le_nv12(int width, int height,
+                                    uchar  *dst_y,  int dst_y_pitch,  const ushort  *src_y,  int src_y_pitch,
+                                    uchar2 *dst_uv, int dst_uv_pitch, const ushort2 *src_uv, int src_uv_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(src_y);
+
+        dst_y[y * dst_y_pitch + x] = conv_10to8(src_y[y * src_y_pitch + x]);
+    }
+
+    if (x < width / 2 && y < height / 2) {
+        FIX_PITCH(dst_uv);
+        FIX_PITCH(src_uv);
+
+        dst_uv[y * dst_uv_pitch + x] = conv_10to8(src_uv[y * src_uv_pitch + x]);
+    }
+}
+
+__global__ void Convert_p010le_yuv444p(int width, int height,
+                                       uchar *dst_y, int dst_y_pitch, const ushort  *src_y,  int src_y_pitch,
+                                       uchar *dst_u, int dst_u_pitch, const ushort2 *src_uv, int src_uv_pitch,
+                                       uchar *dst_v, int dst_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(src_y);
+        FIX_PITCH(src_uv);
+
+        dst_y[y * dst_y_pitch + x] = conv_10to8(src_y[y * src_y_pitch + x]);
+
+        uchar2 uv = conv_10to8(conv_420to444(src_uv, width, height, src_uv_pitch, x, y));
+        dst_u[y * dst_u_pitch + x] = uv.x;
+        dst_v[y * dst_v_pitch + x] = uv.y;
+    }
+}
+
+__global__ void Convert_p010le_p016le(int width, int height,
+                                      ushort  *dst_y,  int dst_y_pitch,  const ushort  *src_y,  int src_y_pitch,
+                                      ushort2 *dst_uv, int dst_uv_pitch, const ushort2 *src_uv, int src_uv_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(dst_y);
+        FIX_PITCH(src_y);
+
+        dst_y[y * dst_y_pitch + x] = conv_10to16(src_y[y * src_y_pitch + x]);
+    }
+
+    if (x < width / 2 && y < height / 2) {
+        FIX_PITCH(dst_uv);
+        FIX_PITCH(src_uv);
+
+        dst_uv[y * dst_uv_pitch + x] = conv_10to16(src_uv[y * src_uv_pitch + x]);
+    }
+}
+
+__global__ void Convert_p010le_yuv444p16le(int width, int height,
+                                           ushort *dst_y, int dst_y_pitch, const ushort  *src_y,  int src_y_pitch,
+                                           ushort *dst_u, int dst_u_pitch, const ushort2 *src_uv, int src_uv_pitch,
+                                           ushort *dst_v, int dst_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(dst_y);
+        FIX_PITCH(dst_u);
+        FIX_PITCH(dst_v);
+        FIX_PITCH(src_y);
+        FIX_PITCH(src_uv);
+
+        dst_y[y * dst_y_pitch + x] = conv_10to16(src_y[y * src_y_pitch + x]);
+
+        ushort2 uv = conv_10to16(conv_420to444(src_uv, width, height, src_uv_pitch, x, y));
+        dst_u[y * dst_u_pitch + x] = uv.x;
+        dst_v[y * dst_v_pitch + x] = uv.y;
+    }
+}
+
+} // extern "C"
+
+// p016le->X
+extern "C" {
+
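+// p016le -> yuv420p: 16-bit to 8-bit reduction via conv_16to8; the interleaved chroma
+// is split into separate quarter-resolution U and V planes.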
+__global__ void Convert_p016le_yuv420p(int width, int height,
+                                       uchar *dst_y, int dst_y_pitch, const ushort  *src_y,  int src_y_pitch,
+                                       uchar *dst_u, int dst_u_pitch, const ushort2 *src_uv, int src_uv_pitch,
+                                       uchar *dst_v, int dst_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(src_y);
+
+        dst_y[y * dst_y_pitch + x] = conv_16to8(src_y[y * src_y_pitch + x]);
+    }
+
+    if (x < width / 2 && y < height / 2) {
+        FIX_PITCH(src_uv);
+
+        uchar2 uv = conv_16to8(src_uv[y * src_uv_pitch + x]);
+        dst_u[y * dst_u_pitch + x] = uv.x;
+        dst_v[y * dst_v_pitch + x] = uv.y;
+    }
+}
+
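+// p016le -> nv12: 16-bit to 8-bit reduction, semi-planar layout kept.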
+__global__ void Convert_p016le_nv12(int width, int height,
+                                    uchar  *dst_y,  int dst_y_pitch,  const ushort  *src_y,  int src_y_pitch,
+                                    uchar2 *dst_uv, int dst_uv_pitch, const ushort2 *src_uv, int src_uv_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(src_y);
+
+        dst_y[y * dst_y_pitch + x] = conv_16to8(src_y[y * src_y_pitch + x]);
+    }
+
+    if (x < width / 2 && y < height / 2) {
+        FIX_PITCH(dst_uv);
+        FIX_PITCH(src_uv);
+
+        dst_uv[y * dst_uv_pitch + x] = conv_16to8(src_uv[y * src_uv_pitch + x]);
+    }
+}
+
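+// p016le -> yuv444p: 16-bit to 8-bit reduction with 4:2:0 -> 4:4:4 chroma upsampling.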
+__global__ void Convert_p016le_yuv444p(int width, int height,
+                                       uchar *dst_y, int dst_y_pitch, const ushort  *src_y,  int src_y_pitch,
+                                       uchar *dst_u, int dst_u_pitch, const ushort2 *src_uv, int src_uv_pitch,
+                                       uchar *dst_v, int dst_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(src_y);
+        FIX_PITCH(src_uv);
+
+        dst_y[y * dst_y_pitch + x] = conv_16to8(src_y[y * src_y_pitch + x]);
+
+        uchar2 uv = conv_16to8(conv_420to444(src_uv, width, height, src_uv_pitch, x, y));
+        dst_u[y * dst_u_pitch + x] = uv.x;
+        dst_v[y * dst_v_pitch + x] = uv.y;
+    }
+}
+
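+// p016le -> p010le: per-sample bit-depth reduction via conv_16to10; layout unchanged.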
+__global__ void Convert_p016le_p010le(int width, int height,
+                                      ushort  *dst_y,  int dst_y_pitch,  const ushort  *src_y,  int src_y_pitch,
+                                      ushort2 *dst_uv, int dst_uv_pitch, const ushort2 *src_uv, int src_uv_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(dst_y);
+        FIX_PITCH(src_y);
+
+        dst_y[y * dst_y_pitch + x] = conv_16to10(src_y[y * src_y_pitch + x]);
+    }
+
+    if (x < width / 2 && y < height / 2) {
+        FIX_PITCH(dst_uv);
+        FIX_PITCH(src_uv);
+
+        dst_uv[y * dst_uv_pitch + x] = conv_16to10(src_uv[y * src_uv_pitch + x]);
+    }
+}
+
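+// p016le -> yuv444p16le: luma is copied verbatim (both formats are 16-bit); chroma is
+// upsampled to 4:4:4 without a bit-depth change.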
+__global__ void Convert_p016le_yuv444p16le(int width, int height,
+                                           ushort *dst_y, int dst_y_pitch, const ushort  *src_y,  int src_y_pitch,
+                                           ushort *dst_u, int dst_u_pitch, const ushort2 *src_uv, int src_uv_pitch,
+                                           ushort *dst_v, int dst_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(dst_y);
+        FIX_PITCH(dst_u);
+        FIX_PITCH(dst_v);
+        FIX_PITCH(src_y);
+        FIX_PITCH(src_uv);
+
+        dst_y[y * dst_y_pitch + x] = src_y[y * src_y_pitch + x];
+
+        ushort2 uv = conv_420to444(src_uv, width, height, src_uv_pitch, x, y);
+        dst_u[y * dst_u_pitch + x] = uv.x;
+        dst_v[y * dst_v_pitch + x] = uv.y;
+    }
+}
+
+} // extern "C"
+
+// yuv444p16le->X
+extern "C" {
+
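+// The yuv444p16le kernels run one thread per full-resolution pixel; for subsampled
+// targets the chroma output is produced by conv_444to420 and written only at even x/y.
+// yuv444p16le -> yuv420p: 16-bit to 8-bit reduction with 4:4:4 -> 4:2:0 chroma subsampling.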
+__global__ void Convert_yuv444p16le_yuv420p(int width, int height,
+                                            uchar *dst_y, int dst_y_pitch, const ushort *src_y, int src_y_pitch,
+                                            uchar *dst_u, int dst_u_pitch, const ushort *src_u, int src_u_pitch,
+                                            uchar *dst_v, int dst_v_pitch, const ushort *src_v, int src_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(src_y);
+
+        dst_y[y * dst_y_pitch + x] = conv_16to8(src_y[y * src_y_pitch + x]);
+
+        if ((x & 1) == 0 && (y & 1) == 0) {
+            int x2 = x / 2;
+            int y2 = y / 2;
+            FIX_PITCH(src_u);
+            FIX_PITCH(src_v);
+
+            dst_u[y2 * dst_u_pitch + x2] = conv_16to8(conv_444to420(src_u, src_u_pitch, x, y));
+            dst_v[y2 * dst_v_pitch + x2] = conv_16to8(conv_444to420(src_v, src_v_pitch, x, y));
+        }
+    }
+}
+
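+// yuv444p16le -> nv12: as above, but U and V are interleaved into a single output plane.
+// The unused plane/pitch arguments are never referenced; they only mirror the planar
+// kernels' parameter list.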
+__global__ void Convert_yuv444p16le_nv12(int width, int height,
+                                         uchar  *dst_y,  int dst_y_pitch,  const ushort *src_y, int src_y_pitch,
+                                         uchar2 *dst_uv, int dst_uv_pitch, const ushort *src_u, int src_u_pitch,
+                                         uchar2 *unused, int unused_pitch, const ushort *src_v, int src_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(src_y);
+        dst_y[y * dst_y_pitch + x] = conv_16to8(src_y[y * src_y_pitch + x]);
+
+        if ((x & 1) == 0 && (y & 1) == 0) {
+            int x2 = x / 2;
+            int y2 = y / 2;
+            FIX_PITCH(src_u);
+            FIX_PITCH(src_v);
+            FIX_PITCH(dst_uv);
+
+            dst_uv[y2 * dst_uv_pitch + x2] = make_uchar2(
+                conv_16to8(conv_444to420(src_u, src_u_pitch, x, y)),
+                conv_16to8(conv_444to420(src_v, src_v_pitch, x, y))
+            );
+        }
+    }
+}
+
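+// yuv444p16le -> yuv444p: straight per-pixel 16-bit to 8-bit reduction of all three planes.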
+__global__ void Convert_yuv444p16le_yuv444p(int width, int height,
+                                            uchar *dst_y, int dst_y_pitch, const ushort *src_y, int src_y_pitch,
+                                            uchar *dst_u, int dst_u_pitch, const ushort *src_u, int src_u_pitch,
+                                            uchar *dst_v, int dst_v_pitch, const ushort *src_v, int src_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(src_y);
+        FIX_PITCH(src_u);
+        FIX_PITCH(src_v);
+
+        dst_y[y * dst_y_pitch + x] = conv_16to8(src_y[y * src_y_pitch + x]);
+        dst_u[y * dst_u_pitch + x] = conv_16to8(src_u[y * src_u_pitch + x]);
+        dst_v[y * dst_v_pitch + x] = conv_16to8(src_v[y * src_v_pitch + x]);
+    }
+}
+
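+// yuv444p16le -> p010le: conv_16to10 on every sample, with chroma subsampled to 4:2:0
+// and interleaved into a single UV plane.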
+__global__ void Convert_yuv444p16le_p010le(int width, int height,
+                                           ushort  *dst_y,  int dst_y_pitch,  const ushort *src_y, int src_y_pitch,
+                                           ushort2 *dst_uv, int dst_uv_pitch, const ushort *src_u, int src_u_pitch,
+                                           ushort2 *unused, int unused_pitch, const ushort *src_v, int src_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(src_y);
+        FIX_PITCH(dst_y);
+        dst_y[y * dst_y_pitch + x] = conv_16to10(src_y[y * src_y_pitch + x]);
+
+        if ((x & 1) == 0 && (y & 1) == 0) {
+            int x2 = x / 2;
+            int y2 = y / 2;
+            FIX_PITCH(src_u);
+            FIX_PITCH(src_v);
+            FIX_PITCH(dst_uv);
+
+            dst_uv[y2 * dst_uv_pitch + x2] = make_ushort2(
+                conv_16to10(conv_444to420(src_u, src_u_pitch, x, y)),
+                conv_16to10(conv_444to420(src_v, src_v_pitch, x, y))
+            );
+        }
+    }
+}
+
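+// yuv444p16le -> p016le: luma copied verbatim; chroma subsampled to 4:2:0 and
+// interleaved, no bit-depth change.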
+__global__ void Convert_yuv444p16le_p016le(int width, int height,
+                                           ushort  *dst_y,  int dst_y_pitch,  const ushort *src_y, int src_y_pitch,
+                                           ushort2 *dst_uv, int dst_uv_pitch, const ushort *src_u, int src_u_pitch,
+                                           ushort2 *unused, int unused_pitch, const ushort *src_v, int src_v_pitch)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < width && y < height) {
+        FIX_PITCH(src_y);
+        FIX_PITCH(dst_y);
+        dst_y[y * dst_y_pitch + x] = src_y[y * src_y_pitch + x];
+
+        if ((x & 1) == 0 && (y & 1) == 0) {
+            int x2 = x / 2;
+            int y2 = y / 2;
+            FIX_PITCH(src_u);
+            FIX_PITCH(src_v);
+            FIX_PITCH(dst_uv);
+
+            dst_uv[y2 * dst_uv_pitch + x2] = make_ushort2(
+                conv_444to420(src_u, src_u_pitch, x, y),
+                conv_444to420(src_v, src_v_pitch, x, y)
+            );
+        }
+    }
+}
+
+} // extern "C"