[FFmpeg-devel] libavfilter/vf_colorrange_cuda: CUDA-accelerated video filter for MPEG and JPEG color range conversions

Message ID PH7PR12MB5831CF40BE529281E931F963D2429@PH7PR12MB5831.namprd12.prod.outlook.com
State New
Series [FFmpeg-devel] libavfilter/vf_colorrange_cuda: CUDA-accelerated video filter for MPEG and JPEG color range conversions

Checks

Context Check Description
andriy/commit_msg_x86 warning The first line of the commit message must start with a context terminated by a colon and a space, for example "lavu/opt: " or "doc: ".
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Roman Arzumanyan Sept. 10, 2022, 8:16 a.m. UTC
Hello,

This patch adds a video filter which performs color range conversion, similar to what the swscale scaling filter does.

How to use it:
./ffmpeg \
  -hwaccel cuda -hwaccel_output_format cuda \
  -i /path/to/input/file.mp4 \
  -vf colorrange_cuda=range=mpeg \
  -c:v h264_nvenc \
  -y /path/to/output/file.mp4

Comments

Timo Rothenpieler Sept. 10, 2022, 1:16 p.m. UTC | #1
On 10.09.2022 10:16, Roman Arzumanyan wrote:
> From 2b15d8a609a12d97b1ba7500c7f8771b336e2fdf Mon Sep 17 00:00:00 2001
> From: Roman Arzumanyan <rarzumanyan@nvidia.com>
> Date: Sat, 10 Sep 2022 11:05:56 +0300
> Subject: [PATCH] libavfilter/vf_colorrange_cuda CUDA-accelerated color range
>  conversion filter

We could also call this colorspace_cuda, since it does overlap with what 
the colorspace software filter does, just not nearly to the same degree 
of feature-completeness.
That's fine in my book though, and if someone cares enough, the other 
features of the colorspace filter can be added over time.

> ---
>  configure                         |   2 +
>  libavfilter/Makefile              |   3 +
>  libavfilter/allfilters.c          |   1 +
>  libavfilter/vf_colorrange_cuda.c  | 432 ++++++++++++++++++++++++++++++
>  libavfilter/vf_colorrange_cuda.cu |  93 +++++++
>  5 files changed, 531 insertions(+)
>  create mode 100644 libavfilter/vf_colorrange_cuda.c
>  create mode 100644 libavfilter/vf_colorrange_cuda.cu
> 
> diff --git a/configure b/configure
> index 9d6457d81b..e5f9738ad1 100755
> --- a/configure
> +++ b/configure
> @@ -3155,6 +3155,8 @@ transpose_npp_filter_deps="ffnvcodec libnpp"
>  overlay_cuda_filter_deps="ffnvcodec"
>  overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
>  sharpen_npp_filter_deps="ffnvcodec libnpp"
> +colorrange_cuda_filter_deps="ffnvcodec"
> +colorrange_cuda_filter_deps_any="cuda_nvcc cuda_llvm"

Typically new entries should be sorted in alphabetically rather than 
appended at the end.
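For illustration, going by just the lines visible in this hunk, alphabetical 
order would put the new entries before the overlay_cuda ones (the exact 
position of course depends on the surrounding configure entries):

colorrange_cuda_filter_deps="ffnvcodec"
colorrange_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
overlay_cuda_filter_deps="ffnvcodec"
overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
sharpen_npp_filter_deps="ffnvcodec libnpp"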

>  amf_deps_any="libdl LoadLibrary"
>  nvenc_deps="ffnvcodec"
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index 30cc329fb6..784e154d81 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -230,6 +230,9 @@ OBJS-$(CONFIG_COLORMAP_FILTER)               += vf_colormap.o
>  OBJS-$(CONFIG_COLORMATRIX_FILTER)            += vf_colormatrix.o
>  OBJS-$(CONFIG_COLORSPACE_FILTER)             += vf_colorspace.o colorspacedsp.o
>  OBJS-$(CONFIG_COLORTEMPERATURE_FILTER)       += vf_colortemperature.o
> +OBJS-$(CONFIG_COLORRANGE_CUDA_FILTER)        += vf_colorrange_cuda.o \
> +                                                vf_colorrange_cuda.ptx.o \
> +                                                cuda/load_helper.o

Same here regarding alphabetical ordering: this entry should go between 
colormatrix and colorspace.
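That is, roughly:

OBJS-$(CONFIG_COLORMATRIX_FILTER)            += vf_colormatrix.o
OBJS-$(CONFIG_COLORRANGE_CUDA_FILTER)        += vf_colorrange_cuda.o \
                                                vf_colorrange_cuda.ptx.o \
                                                cuda/load_helper.o
OBJS-$(CONFIG_COLORSPACE_FILTER)             += vf_colorspace.o colorspacedsp.o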

>  OBJS-$(CONFIG_CONVOLUTION_FILTER)            += vf_convolution.o
>  OBJS-$(CONFIG_CONVOLUTION_OPENCL_FILTER)     += vf_convolution_opencl.o opencl.o \
>                                                  opencl/convolution.o
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index 5ebacfde27..5e9cbe57ec 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -213,6 +213,7 @@ extern const AVFilter ff_vf_colormap;
>  extern const AVFilter ff_vf_colormatrix;
>  extern const AVFilter ff_vf_colorspace;
>  extern const AVFilter ff_vf_colortemperature;
> +extern const AVFilter ff_vf_colorrange_cuda;
>  extern const AVFilter ff_vf_convolution;
>  extern const AVFilter ff_vf_convolution_opencl;
>  extern const AVFilter ff_vf_convolve;
> diff --git a/libavfilter/vf_colorrange_cuda.c b/libavfilter/vf_colorrange_cuda.c
> new file mode 100644
> index 0000000000..949e7d3bbf
> --- /dev/null
> +++ b/libavfilter/vf_colorrange_cuda.c
> @@ -0,0 +1,432 @@
> +/*
> + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include <string.h>
> +
> +#include "libavutil/avstring.h"
> +#include "libavutil/common.h"
> +#include "libavutil/cuda_check.h"
> +#include "libavutil/hwcontext.h"
> +#include "libavutil/hwcontext_cuda_internal.h"
> +#include "libavutil/internal.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/pixdesc.h"
> +
> +#include "avfilter.h"
> +#include "formats.h"
> +#include "internal.h"
> +#include "scale_eval.h"
> +#include "video.h"
> +
> +#include "cuda/load_helper.h"
> +
> +static const enum AVPixelFormat supported_formats[] = {
> +    AV_PIX_FMT_NV12,
> +    AV_PIX_FMT_YUV420P,
> +    AV_PIX_FMT_YUV444P,
> +};
> +
> +#define DIV_UP(a, b) (((a) + (b)-1) / (b))
> +#define BLOCKX 32
> +#define BLOCKY 16
> +
> +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
> +
> +typedef struct CUDAConvContext {
> +    const AVClass* class;
> +
> +    AVCUDADeviceContext* hwctx;
> +    AVBufferRef* frames_ctx;
> +    AVFrame* own_frame;
> +    AVFrame* tmp_frame;
> +
> +    CUcontext cu_ctx;
> +    CUstream cu_stream;
> +    CUmodule cu_module;
> +    CUfunction cu_convert[AVCOL_RANGE_NB];
> +
> +    enum AVPixelFormat pix_fmt;
> +    enum AVColorRange range;
> +
> +    int num_planes;
> +} CUDAConvContext;
> +
> +static av_cold int cudaconv_init(AVFilterContext* ctx)
> +{
> +    CUDAConvContext* s = ctx->priv;
> +
> +    s->own_frame = av_frame_alloc();
> +    if (!s->own_frame)
> +        return AVERROR(ENOMEM);
> +
> +    s->tmp_frame = av_frame_alloc();
> +    if (!s->tmp_frame)
> +        return AVERROR(ENOMEM);
> +
> +    return 0;
> +}
> +
> +static av_cold void cudaconv_uninit(AVFilterContext* ctx)
> +{
> +    CUDAConvContext* s = ctx->priv;
> +
> +    if (s->hwctx && s->cu_module) {
> +        CudaFunctions* cu = s->hwctx->internal->cuda_dl;
> +        CUcontext dummy;
> +
> +        CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
> +        CHECK_CU(cu->cuModuleUnload(s->cu_module));
> +        s->cu_module = NULL;
> +        CHECK_CU(cu->cuCtxPopCurrent(&dummy));
> +    }
> +
> +    av_frame_free(&s->own_frame);
> +    av_buffer_unref(&s->frames_ctx);
> +    av_frame_free(&s->tmp_frame);
> +}
> +
> +static av_cold int init_hwframe_ctx(CUDAConvContext* s, AVBufferRef* device_ctx,
> +                                    int width, int height)
> +{
> +    AVBufferRef* out_ref = NULL;
> +    AVHWFramesContext* out_ctx;
> +    int ret;
> +
> +    out_ref = av_hwframe_ctx_alloc(device_ctx);
> +    if (!out_ref)
> +        return AVERROR(ENOMEM);
> +
> +    out_ctx = (AVHWFramesContext*)out_ref->data;
> +
> +    out_ctx->format = AV_PIX_FMT_CUDA;
> +    out_ctx->sw_format = s->pix_fmt;
> +    out_ctx->width = FFALIGN(width, 32);
> +    out_ctx->height = FFALIGN(height, 32);
> +
> +    ret = av_hwframe_ctx_init(out_ref);
> +    if (ret < 0)
> +        goto fail;
> +
> +    av_frame_unref(s->own_frame);
> +    ret = av_hwframe_get_buffer(out_ref, s->own_frame, 0);
> +    if (ret < 0)
> +        goto fail;
> +
> +    s->own_frame->width = width;
> +    s->own_frame->height = height;
> +
> +    av_buffer_unref(&s->frames_ctx);
> +    s->frames_ctx = out_ref;
> +
> +    return 0;
> +fail:
> +    av_buffer_unref(&out_ref);
> +    return ret;
> +}
> +
> +static int format_is_supported(enum AVPixelFormat fmt)
> +{
> +    for (int i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
> +        if (fmt == supported_formats[i])
> +            return 1;
> +
> +    return 0;
> +}
> +
> +static av_cold int init_processing_chain(AVFilterContext* ctx, int width,
> +                                         int height)
> +{
> +    CUDAConvContext* s = ctx->priv;
> +    AVHWFramesContext* in_frames_ctx;
> +
> +    int ret;
> +
> +    if (!ctx->inputs[0]->hw_frames_ctx) {
> +        av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n");
> +        return AVERROR(EINVAL);
> +    }
> +
> +    in_frames_ctx = (AVHWFramesContext*)ctx->inputs[0]->hw_frames_ctx->data;
> +    s->pix_fmt = in_frames_ctx->sw_format;
> +
> +    if (!format_is_supported(s->pix_fmt)) {
> +        av_log(ctx, AV_LOG_ERROR, "Unsupported pixel format: %s\n",
> +               av_get_pix_fmt_name(s->pix_fmt));
> +        return AVERROR(ENOSYS);
> +    }
> +
> +    s->num_planes = av_pix_fmt_count_planes(s->pix_fmt);
> +
> +    ret = init_hwframe_ctx(s, in_frames_ctx->device_ref, width, height);
> +    if (ret < 0)
> +        return ret;
> +
> +    ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->frames_ctx);
> +    if (!ctx->outputs[0]->hw_frames_ctx)
> +        return AVERROR(ENOMEM);
> +
> +    return 0;
> +}
> +
> +static av_cold int cudaconv_load_functions(AVFilterContext* ctx)
> +{
> +    CUDAConvContext* s = ctx->priv;
> +    CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx;
> +    CudaFunctions* cu = s->hwctx->internal->cuda_dl;
> +    int ret;
> +
> +    extern const unsigned char ff_vf_colorrange_cuda_ptx_data[];
> +    extern const unsigned int ff_vf_colorrange_cuda_ptx_len;
> +
> +    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
> +    if (ret < 0)
> +        return ret;
> +
> +    ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module,
> +                              ff_vf_colorrange_cuda_ptx_data,
> +                              ff_vf_colorrange_cuda_ptx_len);
> +    if (ret < 0)
> +        goto fail;
> +
> +    ret = CHECK_CU(cu->cuModuleGetFunction(
> +        &s->cu_convert[AVCOL_RANGE_MPEG], s->cu_module,
> +        "to_mpeg_cuda"));
> +
> +    if (ret < 0)
> +        goto fail;
> +
> +    ret = CHECK_CU(cu->cuModuleGetFunction(
> +        &s->cu_convert[AVCOL_RANGE_JPEG], s->cu_module,
> +        "to_jpeg_cuda"));
> +
> +    if (ret < 0)
> +        goto fail;
> +
> +fail:
> +    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
> +    return ret;
> +}
> +
> +static av_cold int cudaconv_config_props(AVFilterLink* outlink)
> +{
> +    AVFilterContext* ctx = outlink->src;
> +    AVFilterLink* inlink = outlink->src->inputs[0];
> +    CUDAConvContext* s = ctx->priv;
> +    AVHWFramesContext* frames_ctx =
> +        (AVHWFramesContext*)inlink->hw_frames_ctx->data;
> +    AVCUDADeviceContext* device_hwctx = frames_ctx->device_ctx->hwctx;
> +    int ret;
> +
> +    s->hwctx = device_hwctx;
> +    s->cu_stream = s->hwctx->stream;
> +
> +    outlink->w = inlink->w;
> +    outlink->h = inlink->h;
> +
> +    ret = init_processing_chain(ctx, inlink->w, inlink->h);
> +    if (ret < 0)
> +        return ret;
> +
> +    if (inlink->sample_aspect_ratio.num) {
> +        outlink->sample_aspect_ratio = av_mul_q(
> +            (AVRational){outlink->h * inlink->w, outlink->w * inlink->h},
> +            inlink->sample_aspect_ratio);
> +    } else {
> +        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
> +    }
> +
> +    ret = cudaconv_load_functions(ctx);
> +    if (ret < 0)
> +        return ret;
> +
> +    return ret;
> +}
> +
> +static int conv_cuda_convert(AVFilterContext* ctx, AVFrame* out, AVFrame* in)
> +{
> +    CUDAConvContext* s = ctx->priv;
> +    CudaFunctions* cu = s->hwctx->internal->cuda_dl;
> +    CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx;
> +    int ret;
> +
> +    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
> +    if (ret < 0)
> +        return ret;
> +
> +    out->color_range = s->range;
> +
> +    for (int i = 0; i < s->num_planes; i++) {
> +        int width = in->width, height = in->height, comp_id = (i > 0);
> +
> +        switch (s->pix_fmt) {
> +        case AV_PIX_FMT_YUV444P:
> +            break;
> +        case AV_PIX_FMT_YUV420P:
> +            width = comp_id ? in->width / 2 : in->width;
> +        case AV_PIX_FMT_NV12:
> +            height = comp_id ? in->height / 2 : in->height;
> +            break;
> +        default:
> +            return AVERROR(ENOSYS);
> +        }
> +
> +        if (in->color_range != out->color_range) {
> +            void* args[] = {&in->data[i], &out->data[i], &in->linesize[i],
> +                            &comp_id};
> +            ret = CHECK_CU(cu->cuLaunchKernel(
> +                s->cu_convert[out->color_range], DIV_UP(width, BLOCKX),

What happens if the user specifies a color range that's not mpeg or 
jpeg? Like UNSPECIFIED, which is even the default.
The AVOption absolutely allows that, and I see no check that verifies a 
kernel for that conversion exists, so this would end up passing a NULL 
kernel to cuLaunchKernel.

This should be an easy enough check at init time, after loading the 
kernels: no kernel for the given color range? Error.
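A minimal sketch of such a check, assuming it sits in cudaconv_config_props() 
right after the cudaconv_load_functions() call and that the unused 
cu_convert[] slots stay NULL (the priv context is zero-initialized):

    ret = cudaconv_load_functions(ctx);
    if (ret < 0)
        return ret;

    /* s->range is bounded by the AVOption to [0, AVCOL_RANGE_NB - 1], and
     * only the MPEG and JPEG slots of cu_convert[] are ever filled in. */
    if (!s->cu_convert[s->range]) {
        av_log(ctx, AV_LOG_ERROR,
               "Unsupported color range, only mpeg and jpeg are handled\n");
        return AVERROR(EINVAL);
    }

    return 0;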

> +                DIV_UP(height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, s->cu_stream,
> +                args, NULL));
> +        } else {
> +            av_hwframe_transfer_data(out, in, 0);
> +        }
> +    }
> +
> +    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
> +    return ret;
> +}
> +
> +static int cudaconv_conv(AVFilterContext* ctx, AVFrame* out, AVFrame* in)
> +{
> +    CUDAConvContext* s = ctx->priv;
> +    AVFilterLink* outlink = ctx->outputs[0];
> +    AVFrame* src = in;
> +    int ret;
> +
> +    ret = conv_cuda_convert(ctx, s->own_frame, src);
> +    if (ret < 0)
> +        return ret;
> +
> +    src = s->own_frame;
> +    ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0);
> +    if (ret < 0)
> +        return ret;
> +
> +    av_frame_move_ref(out, s->own_frame);
> +    av_frame_move_ref(s->own_frame, s->tmp_frame);
> +
> +    s->own_frame->width = outlink->w;
> +    s->own_frame->height = outlink->h;
> +
> +    ret = av_frame_copy_props(out, in);
> +    if (ret < 0)
> +        return ret;
> +
> +    return 0;
> +}
> +
> +static int cudaconv_filter_frame(AVFilterLink* link, AVFrame* in)
> +{
> +    AVFilterContext* ctx = link->dst;
> +    CUDAConvContext* s = ctx->priv;
> +    AVFilterLink* outlink = ctx->outputs[0];
> +    CudaFunctions* cu = s->hwctx->internal->cuda_dl;
> +
> +    AVFrame* out = NULL;
> +    CUcontext dummy;
> +    int ret = 0;
> +
> +    out = av_frame_alloc();
> +    if (!out) {
> +        ret = AVERROR(ENOMEM);
> +        goto fail;
> +    }
> +
> +    ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
> +    if (ret < 0)
> +        goto fail;
> +
> +    ret = cudaconv_conv(ctx, out, in);
> +
> +    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
> +    if (ret < 0)
> +        goto fail;
> +
> +    av_reduce(&out->sample_aspect_ratio.num, &out->sample_aspect_ratio.den,
> +              (int64_t)in->sample_aspect_ratio.num * outlink->h * link->w,
> +              (int64_t)in->sample_aspect_ratio.den * outlink->w * link->h,
> +              INT_MAX);
> +
> +    av_frame_free(&in);
> +    return ff_filter_frame(outlink, out);
> +fail:
> +    av_frame_free(&in);
> +    av_frame_free(&out);
> +    return ret;
> +}
> +
> +static AVFrame* cudaconv_get_video_buffer(AVFilterLink* inlink, int w, int h)
> +{
> +    return ff_default_get_video_buffer(inlink, w, h);
> +}

This function can be removed entirely, since ff_default_get_video_buffer 
is what's called by default anyway.
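With that, the input pad boils down to the same declaration minus the 
callback:

static const AVFilterPad cudaconv_inputs[] = {
    {
        .name = "default",
        .type = AVMEDIA_TYPE_VIDEO,
        .filter_frame = cudaconv_filter_frame,
    },
};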

> +#define OFFSET(x) offsetof(CUDAConvContext, x)
> +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
> +static const AVOption options[] = {
> +    {"range", "Output video range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = AVCOL_RANGE_UNSPECIFIED}, AVCOL_RANGE_UNSPECIFIED, AVCOL_RANGE_NB - 1, FLAGS, "range"},
> +        {"mpeg", "limited range", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG}, 0, 0, FLAGS, "range"},
> +        {"jpeg", "full range",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG}, 0, 0, FLAGS, "range"},
> +    {NULL},
> +};
> +
> +static const AVClass cudaconv_class = {
> +    .class_name = "cudaconv",

All the mentions of cudaconv in this file should be renamed to match the 
filter name. It doesn't overly matter for functionality, but the class 
name does end up in logs, and the function names are purely for neatness.
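For example, just the rename and nothing functional:

static const AVClass colorrange_cuda_class = {
    .class_name = "colorrange_cuda",
    .item_name = av_default_item_name,
    .option = options,
    .version = LIBAVUTIL_VERSION_INT,
};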

> +    .item_name = av_default_item_name,
> +    .option = options,
> +    .version = LIBAVUTIL_VERSION_INT,
> +};
> +
> +static const AVFilterPad cudaconv_inputs[] = {
> +    {
> +        .name = "default",
> +        .type = AVMEDIA_TYPE_VIDEO,
> +        .filter_frame = cudaconv_filter_frame,
> +        .get_buffer.video = cudaconv_get_video_buffer,
> +    },
> +};
> +
> +static const AVFilterPad cudaconv_outputs[] = {
> +    {
> +        .name = "default",
> +        .type = AVMEDIA_TYPE_VIDEO,
> +        .config_props = cudaconv_config_props,
> +    },
> +};
> +
> +const AVFilter ff_vf_colorrange_cuda = {
> +    .name = "colorrange_cuda",
> +    .description =
> +        NULL_IF_CONFIG_SMALL("CUDA accelerated video color range converter"),
> +
> +    .init = cudaconv_init,
> +    .uninit = cudaconv_uninit,
> +
> +    .priv_size = sizeof(CUDAConvContext),
> +    .priv_class = &cudaconv_class,
> +
> +    FILTER_INPUTS(cudaconv_inputs),
> +    FILTER_OUTPUTS(cudaconv_outputs),
> +
> +    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA),
> +
> +    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
> +};
> diff --git a/libavfilter/vf_colorrange_cuda.cu b/libavfilter/vf_colorrange_cuda.cu
> new file mode 100644
> index 0000000000..6f617493f8
> --- /dev/null
> +++ b/libavfilter/vf_colorrange_cuda.cu
> @@ -0,0 +1,93 @@
> +/*
> + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +extern "C" {
> +#define MPEG_LUMA_MIN   (16)
> +#define MPEG_CHROMA_MIN (16)
> +#define MPEG_LUMA_MAX   (235)
> +#define MPEG_CHROMA_MAX (240)
> +
> +#define JPEG_LUMA_MIN   (0)
> +#define JPEG_CHROMA_MIN (1)
> +#define JPEG_LUMA_MAX   (255)
> +#define JPEG_CHROMA_MAX (255)
> +
> +__device__ int mpeg_min[] = {MPEG_LUMA_MIN, MPEG_CHROMA_MIN};
> +__device__ int mpeg_max[] = {MPEG_LUMA_MAX, MPEG_CHROMA_MAX};
> +
> +__device__ int jpeg_min[] = {JPEG_LUMA_MIN, JPEG_CHROMA_MIN};
> +__device__ int jpeg_max[] = {JPEG_LUMA_MAX, JPEG_CHROMA_MAX};
> +
> +__device__ int clamp(int val, int min, int max)
> +{
> +    if (val < min)
> +        return min;
> +    else if (val > max)
> +        return max;
> +    else
> +        return val;
> +}
> +
> +__global__ void to_jpeg_cuda(const unsigned char* src, unsigned char* dst,
> +                             int pitch, int comp_id)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +    int src_, dst_;
> +
> +    // 8 bit -> 15 bit for better precision;
> +    src_ = static_cast<int>(src[x + y * pitch]) << 7;
> +
> +    // Conversion;
> +    dst_ = comp_id ? (min(src_, 30775) * 4663 - 9289992) >> 12    // chroma
> +                   : (min(src_, 30189) * 19077 - 39057361) >> 14; // luma
> +
> +    // Dither replacement;
> +    dst_ = dst_ + 64;
> +
> +    // Back to 8 bit;
> +    dst_ = clamp(dst_ >> 7, jpeg_min[comp_id], jpeg_max[comp_id]);
> +    dst[x + y * pitch] = static_cast<unsigned char>(dst_);
> +}
> +
> +__global__ void to_mpeg_cuda(const unsigned char* src, unsigned char* dst,
> +                             int pitch, int comp_id)
> +{
> +    int x = blockIdx.x * blockDim.x + threadIdx.x;
> +    int y = blockIdx.y * blockDim.y + threadIdx.y;
> +    int src_, dst_;
> +
> +    // 8 bit -> 15 bit for better precision;
> +    src_ = static_cast<int>(src[x + y * pitch]) << 7;
> +
> +    // Conversion;
> +    dst_ = comp_id ? (src_ * 1799 + 4081085) >> 11    // chroma
> +                   : (src_ * 14071 + 33561947) >> 14; // luma
> +
> +    // Dither replacement;
> +    dst_ = dst_ + 64;
> +
> +    // Back to 8 bit;
> +    dst_ = clamp(dst_ >> 7, mpeg_min[comp_id], mpeg_max[comp_id]);
> +    dst[x + y * pitch] = static_cast<unsigned char>(dst_);
> +}
> +}
> \ No newline at end of file
> -- 
> 2.25.1
> 

Looks good to me at first glance otherwise; will give it a test soon.
Roman Arzumanyan Sept. 11, 2022, 7:28 a.m. UTC | #2
Thanks for the detailed review, Timo.

Please find the fixed patch in the attachment.
Timo Rothenpieler Sept. 13, 2022, 9:05 p.m. UTC | #3
On 11.09.2022 09:28, Roman Arzumanyan wrote:
> Thanks for the detailed review, Timo.
> 
> Please find the fixed patch in the attachment.

I've renamed the filter to colorspace_cuda, did some super minor 
stylistic fixes and applied the patch.

Works as I would expect. Obviously it's super limited in what it can do, 
as is the intent for now. But that's fine, features the software version 
offers can be added as people need/want them.
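With the rename, the cover letter's example becomes (assuming the range 
option keeps its name):

./ffmpeg \
  -hwaccel cuda -hwaccel_output_format cuda \
  -i /path/to/input/file.mp4 \
  -vf colorspace_cuda=range=mpeg \
  -c:v h264_nvenc \
  -y /path/to/output/file.mp4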