diff mbox

[FFmpeg-devel,FFmpeg-devel,v2] Add GPU accelerated video crop filter

Message ID 1553356270-179169-1-git-send-email-nowerzt@gmail.com
State New
Headers show

Commit Message

leozhang March 23, 2019, 3:51 p.m. UTC
Signed-off-by: UsingtcNower <nowerzt@gmail.com>
---
 Changelog                   |   1 +
 configure                   |   1 +
 doc/filters.texi            |  31 +++
 libavfilter/Makefile        |   1 +
 libavfilter/allfilters.c    |   1 +
 libavfilter/version.h       |   2 +-
 libavfilter/vf_crop_cuda.c  | 638 ++++++++++++++++++++++++++++++++++++++++++++
 libavfilter/vf_crop_cuda.cu | 109 ++++++++
 8 files changed, 783 insertions(+), 1 deletion(-)
 create mode 100644 libavfilter/vf_crop_cuda.c
 create mode 100644 libavfilter/vf_crop_cuda.cu

Comments

leozhang March 23, 2019, 11:26 p.m. UTC | #1
The corrected version. If there are no other comments or objections, could
this be pushed?

UsingtcNower <nowerzt@gmail.com> 于2019年3月23日周六 下午11:51写道:

> Signed-off-by: UsingtcNower <nowerzt@gmail.com>
> ---
>  Changelog                   |   1 +
>  configure                   |   1 +
>  doc/filters.texi            |  31 +++
>  libavfilter/Makefile        |   1 +
>  libavfilter/allfilters.c    |   1 +
>  libavfilter/version.h       |   2 +-
>  libavfilter/vf_crop_cuda.c  | 638
> ++++++++++++++++++++++++++++++++++++++++++++
>  libavfilter/vf_crop_cuda.cu | 109 ++++++++
>  8 files changed, 783 insertions(+), 1 deletion(-)
>  create mode 100644 libavfilter/vf_crop_cuda.c
>  create mode 100644 libavfilter/vf_crop_cuda.cu
>
> diff --git a/Changelog b/Changelog
> index ad7e82f..f224fc8 100644
> --- a/Changelog
> +++ b/Changelog
> @@ -20,6 +20,7 @@ version <next>:
>  - libaribb24 based ARIB STD-B24 caption support (profiles A and C)
>  - Support decoding of HEVC 4:4:4 content in nvdec and cuviddec
>  - removed libndi-newtek
> +- crop_cuda GPU accelerated video crop filter
>
>
>  version 4.1:
> diff --git a/configure b/configure
> index 331393f..3f3ac2f 100755
> --- a/configure
> +++ b/configure
> @@ -2973,6 +2973,7 @@ qsvvpp_select="qsv"
>  vaapi_encode_deps="vaapi"
>  v4l2_m2m_deps="linux_videodev2_h sem_timedwait"
>
> +crop_cuda_filter_deps="ffnvcodec cuda_nvcc"
>  hwupload_cuda_filter_deps="ffnvcodec"
>  scale_npp_filter_deps="ffnvcodec libnpp"
>  scale_cuda_filter_deps="ffnvcodec cuda_nvcc"
> diff --git a/doc/filters.texi b/doc/filters.texi
> index 4ffb392..ee16a2d 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -7415,6 +7415,37 @@ If the specified expression is not valid, it is
> kept at its current
>  value.
>  @end table
>
> +@section crop_cuda
> +
> +Crop the input video to given dimensions, implemented in CUDA.
> +
> +It accepts the following parameters:
> +
> +@table @option
> +
> +@item w
> +The width of the output video. It defaults to @code{iw}.
> +This expression is evaluated only once during the filter
> +configuration.
> +
> +@item h
> +The height of the output video. It defaults to @code{ih}.
> +This expression is evaluated only once during the filter
> +configuration.
> +
> +@item x
> +The horizontal position, in the input video, of the left edge of the
> output
> +video. It defaults to @code{(in_w-out_w)/2}.
> +This expression is evaluated only once during the filter
> +configuration.
> +
> +@item y
> +The vertical position, in the input video, of the top edge of the output
> video.
> +It defaults to @code{(in_h-out_h)/2}.
> +This expression is evaluated only once during the filter
> +configuration.
> +@end table
> +
>  @section cropdetect
>
>  Auto-detect the crop size.
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index fef6ec5..84df037 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -187,6 +187,7 @@ OBJS-$(CONFIG_COPY_FILTER)                   +=
> vf_copy.o
>  OBJS-$(CONFIG_COREIMAGE_FILTER)              += vf_coreimage.o
>  OBJS-$(CONFIG_COVER_RECT_FILTER)             += vf_cover_rect.o
> lavfutils.o
>  OBJS-$(CONFIG_CROP_FILTER)                   += vf_crop.o
> +OBJS-$(CONFIG_CROP_CUDA_FILTER)              += vf_crop_cuda.o
> vf_crop_cuda.ptx.o
>  OBJS-$(CONFIG_CROPDETECT_FILTER)             += vf_cropdetect.o
>  OBJS-$(CONFIG_CUE_FILTER)                    += f_cue.o
>  OBJS-$(CONFIG_CURVES_FILTER)                 += vf_curves.o
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index c51ae0f..550e545 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -175,6 +175,7 @@ extern AVFilter ff_vf_copy;
>  extern AVFilter ff_vf_coreimage;
>  extern AVFilter ff_vf_cover_rect;
>  extern AVFilter ff_vf_crop;
> +extern AVFilter ff_vf_crop_cuda;
>  extern AVFilter ff_vf_cropdetect;
>  extern AVFilter ff_vf_cue;
>  extern AVFilter ff_vf_curves;
> diff --git a/libavfilter/version.h b/libavfilter/version.h
> index c71282c..5aa95f4 100644
> --- a/libavfilter/version.h
> +++ b/libavfilter/version.h
> @@ -31,7 +31,7 @@
>
>  #define LIBAVFILTER_VERSION_MAJOR   7
>  #define LIBAVFILTER_VERSION_MINOR  48
> -#define LIBAVFILTER_VERSION_MICRO 100
> +#define LIBAVFILTER_VERSION_MICRO 101
>
>  #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR,
> \
>                                                 LIBAVFILTER_VERSION_MINOR,
> \
> diff --git a/libavfilter/vf_crop_cuda.c b/libavfilter/vf_crop_cuda.c
> new file mode 100644
> index 0000000..fc6a2a6
> --- /dev/null
> +++ b/libavfilter/vf_crop_cuda.c
> @@ -0,0 +1,638 @@
> +/*
> +* Copyright (c) 2019, iQIYI CORPORATION. All rights reserved.
> +*
> +* Permission is hereby granted, free of charge, to any person obtaining a
> +* copy of this software and associated documentation files (the
> "Software"),
> +* to deal in the Software without restriction, including without
> limitation
> +* the rights to use, copy, modify, merge, publish, distribute, sublicense,
> +* and/or sell copies of the Software, and to permit persons to whom the
> +* Software is furnished to do so, subject to the following conditions:
> +*
> +* The above copyright notice and this permission notice shall be included
> in
> +* all copies or substantial portions of the Software.
> +*
> +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
> OR
> +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
> OTHER
> +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> +* DEALINGS IN THE SOFTWARE.
> +*/
> +
> +#include <stdio.h>
> +#include <string.h>
> +
> +#include "libavutil/avstring.h"
> +#include "libavutil/common.h"
> +#include "libavutil/hwcontext.h"
> +#include "libavutil/hwcontext_cuda_internal.h"
> +#include "libavutil/cuda_check.h"
> +#include "libavutil/internal.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/pixdesc.h"
> +#include "libavutil/eval.h"
> +
> +#include "avfilter.h"
> +#include "formats.h"
> +#include "internal.h"
> +#include "video.h"
> +
> +static const char *const var_names[] = {
> +    "in_w", "iw",   ///< width  of the input video
> +    "in_h", "ih",   ///< height of the input video
> +    "out_w", "ow",  ///< width  of the cropped video
> +    "out_h", "oh",  ///< height of the cropped video
> +    "x",
> +    "y",
> +    NULL
> +};
> +
> +enum var_name {
> +    VAR_IN_W,  VAR_IW,
> +    VAR_IN_H,  VAR_IH,
> +    VAR_OUT_W, VAR_OW,
> +    VAR_OUT_H, VAR_OH,
> +    VAR_X,
> +    VAR_Y,
> +    VAR_VARS_NB
> +};
> +
> +static const enum AVPixelFormat supported_formats[] = {
> +    AV_PIX_FMT_YUV420P,
> +    AV_PIX_FMT_NV12,
> +    AV_PIX_FMT_YUV444P,
> +    AV_PIX_FMT_P010,
> +    AV_PIX_FMT_P016
> +};
> +
> +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
> +#define ALIGN_UP(a, b) (((a) + (b) - 1) & ~((b) - 1))
> +#define NUM_BUFFERS 2
> +#define BLOCKX 32
> +#define BLOCKY 16
> +
> +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
> +
> +typedef struct CUDACropContext {
> +    const AVClass *class;
> +    AVCUDADeviceContext *hwctx;
> +    enum AVPixelFormat in_fmt;
> +    enum AVPixelFormat out_fmt;
> +
> +    struct {
> +        int width;
> +        int height;
> +        int left;
> +        int top;
> +    } planes_in[3], planes_out[3];
> +
> +    AVBufferRef *frames_ctx;
> +    AVFrame     *frame;
> +
> +    AVFrame *tmp_frame;
> +    int passthrough;
> +
> +    /**
> +     * Output sw format. AV_PIX_FMT_NONE for no conversion.
> +     */
> +    enum AVPixelFormat format;
> +
> +    int w,h,x,y;
> +    char *w_expr, *h_expr, *x_expr, *y_expr;
> +    double var_values[VAR_VARS_NB];
> +
> +    CUcontext   cu_ctx;
> +    CUmodule    cu_module;
> +    CUfunction  cu_func_uchar;
> +    CUfunction  cu_func_uchar2;
> +    CUfunction  cu_func_uchar4;
> +    CUfunction  cu_func_ushort;
> +    CUfunction  cu_func_ushort2;
> +    CUfunction  cu_func_ushort4;
> +    CUstream    cu_stream;
> +
> +    CUdeviceptr srcBuffer;
> +    CUdeviceptr dstBuffer;
> +    int         tex_alignment;
> +} CUDACropContext;
> +
> +static av_cold int cudacrop_init(AVFilterContext *ctx)
> +{
> +    CUDACropContext *s = ctx->priv;
> +
> +    s->format = AV_PIX_FMT_NONE;
> +    s->frame = av_frame_alloc();
> +    if (!s->frame)
> +        return AVERROR(ENOMEM);
> +
> +    s->tmp_frame = av_frame_alloc();
> +    if (!s->tmp_frame)
> +        return AVERROR(ENOMEM);
> +
> +    return 0;
> +}
> +
> +static av_cold void cudacrop_uninit(AVFilterContext *ctx)
> +{
> +    CUDACropContext *s = ctx->priv;
> +
> +    av_frame_free(&s->frame);
> +    av_buffer_unref(&s->frames_ctx);
> +    av_frame_free(&s->tmp_frame);
> +}
> +
> +static int cudacrop_query_formats(AVFilterContext *ctx)
> +{
> +    static const enum AVPixelFormat pixel_formats[] = {
> +        AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE,
> +    };
> +    AVFilterFormats *pix_fmts = ff_make_format_list(pixel_formats);
> +
> +    return ff_set_common_formats(ctx, pix_fmts);
> +}
> +
> +static av_cold int init_stage(CUDACropContext *s, AVBufferRef *device_ctx)
> +{
> +    AVBufferRef *out_ref = NULL;
> +    AVHWFramesContext *out_ctx;
> +    int in_sw, in_sh, out_sw, out_sh;
> +    int ret, i;
> +
> +    av_pix_fmt_get_chroma_sub_sample(s->in_fmt,  &in_sw,  &in_sh);
> +    av_pix_fmt_get_chroma_sub_sample(s->out_fmt, &out_sw, &out_sh);
> +    if (!s->planes_out[0].width) {
> +        s->planes_out[0].width  = s->planes_in[0].width;
> +        s->planes_out[0].height = s->planes_in[0].height;
> +        s->planes_out[0].left = s->planes_in[0].left;
> +        s->planes_out[0].top  = s->planes_in[0].top;
> +    }
> +
> +    for (i = 1; i < FF_ARRAY_ELEMS(s->planes_in); i++) {
> +        s->planes_in[i].width   = s->planes_in[0].width   >> in_sw;
> +        s->planes_in[i].height  = s->planes_in[0].height  >> in_sh;
> +        s->planes_in[i].left    = s->planes_in[0].left    >> in_sw;
> +        s->planes_in[i].top     = s->planes_in[0].top     >> in_sh;
> +        s->planes_out[i].width  = s->planes_out[0].width  >> out_sw;
> +        s->planes_out[i].height = s->planes_out[0].height >> out_sh;
> +        s->planes_out[i].left   = 0;
> +        s->planes_out[i].top    = 0;
> +
> +    }
> +
> +    out_ref = av_hwframe_ctx_alloc(device_ctx);
> +    if (!out_ref)
> +        return AVERROR(ENOMEM);
> +    out_ctx = (AVHWFramesContext*)out_ref->data;
> +
> +    out_ctx->format    = AV_PIX_FMT_CUDA;
> +    out_ctx->sw_format = s->out_fmt;
> +    out_ctx->width     = FFALIGN(s->planes_out[0].width,  32);
> +    out_ctx->height    = FFALIGN(s->planes_out[0].height, 32);
> +
> +    ret = av_hwframe_ctx_init(out_ref);
> +    if (ret < 0)
> +        goto fail;
> +
> +    av_frame_unref(s->frame);
> +    ret = av_hwframe_get_buffer(out_ref, s->frame, 0);
> +    if (ret < 0)
> +        goto fail;
> +
> +    s->frame->width  = s->planes_out[0].width;
> +    s->frame->height = s->planes_out[0].height;
> +
> +    av_buffer_unref(&s->frames_ctx);
> +    s->frames_ctx = out_ref;
> +
> +    return 0;
> +fail:
> +    av_buffer_unref(&out_ref);
> +    return ret;
> +}
> +
> +static int format_is_supported(enum AVPixelFormat fmt)
> +{
> +    int i;
> +
> +    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
> +        if (supported_formats[i] == fmt)
> +            return 1;
> +    return 0;
> +}
> +
> +static av_cold int init_processing_chain(AVFilterContext *ctx, int
> in_width, int in_height,
> +                                         int out_width, int out_height,
> +                                         int left, int top)
> +{
> +    CUDACropContext *s = ctx->priv;
> +
> +    AVHWFramesContext *in_frames_ctx;
> +
> +    enum AVPixelFormat in_format;
> +    enum AVPixelFormat out_format;
> +    int ret;
> +
> +    /* check that we have a hw context */
> +    if (!ctx->inputs[0]->hw_frames_ctx) {
> +        av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n");
> +        return AVERROR(EINVAL);
> +    }
> +    in_frames_ctx =
> (AVHWFramesContext*)ctx->inputs[0]->hw_frames_ctx->data;
> +    in_format     = in_frames_ctx->sw_format;
> +    out_format    = (s->format == AV_PIX_FMT_NONE) ? in_format :
> s->format;
> +
> +    if (!format_is_supported(in_format)) {
> +        av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n",
> +               av_get_pix_fmt_name(in_format));
> +        return AVERROR(ENOSYS);
> +    }
> +    if (!format_is_supported(out_format)) {
> +        av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n",
> +               av_get_pix_fmt_name(out_format));
> +        return AVERROR(ENOSYS);
> +    }
> +
> +    if (in_width == out_width && in_height == out_height)
> +        s->passthrough = 1;
> +
> +    s->in_fmt = in_format;
> +    s->out_fmt = out_format;
> +
> +    s->planes_in[0].width   = in_width;
> +    s->planes_in[0].height  = in_height;
> +    s->planes_in[0].left = left;
> +    s->planes_in[0].top = top;
> +    s->planes_out[0].width  = out_width;
> +    s->planes_out[0].height = out_height;
> +    s->planes_out[0].left = 0;
> +    s->planes_out[0].top = 0;
> +
> +    ret = init_stage(s, in_frames_ctx->device_ref);
> +    if (ret < 0)
> +        return ret;
> +
> +    ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->frames_ctx);
> +    if (!ctx->outputs[0]->hw_frames_ctx)
> +        return AVERROR(ENOMEM);
> +
> +    return 0;
> +}
> +
> +static inline int normalize_double(int *n, double d)
> +{
> +    int ret = 0;
> +
> +    if (isnan(d))
> +        ret = AVERROR(EINVAL);
> +    else if (d > INT_MAX || d < INT_MIN) {
> +        *n = d > INT_MAX ? INT_MAX : INT_MIN;
> +        ret = AVERROR(EINVAL);
> +    } else
> +        *n = lrint(d);
> +
> +    return ret;
> +}
> +
> +static av_cold int cudacrop_config_input(AVFilterLink *inlink)
> +{
> +    AVFilterContext *ctx = inlink->dst;
> +    CUDACropContext *s = ctx->priv;
> +    double res;
> +    int ret;
> +
> +    s->var_values[VAR_IN_W] = s->var_values[VAR_IW] = inlink->w;
> +    s->var_values[VAR_IN_H] = s->var_values[VAR_IH] = inlink->h;
> +    s->var_values[VAR_OUT_W] = s->var_values[VAR_OW] = NAN;
> +    s->var_values[VAR_OUT_H] = s->var_values[VAR_OH] = NAN;
> +    s->var_values[VAR_X] = NAN;
> +    s->var_values[VAR_Y] = NAN;
> +    if ((ret = av_expr_parse_and_eval(&res, s->w_expr,
> +                                      var_names, s->var_values,
> +                                      NULL, NULL, NULL, NULL, NULL, 0,
> ctx)) < 0)
> +        goto fail;
> +    s->var_values[VAR_OUT_W] = s->var_values[VAR_OW] = res;
> +    if ((ret = av_expr_parse_and_eval(&res, s->h_expr,
> +                                      var_names, s->var_values,
> +                                      NULL, NULL, NULL, NULL, NULL, 0,
> ctx)) < 0)
> +        goto fail;
> +    s->var_values[VAR_OUT_H] = s->var_values[VAR_OH] = res;
> +    if ((ret = av_expr_parse_and_eval(&res, s->x_expr,
> +                                      var_names, s->var_values,
> +                                      NULL, NULL, NULL, NULL, NULL, 0,
> ctx)) < 0)
> +        goto fail;
> +    s->var_values[VAR_X] = res;
> +    if ((ret = av_expr_parse_and_eval(&res, s->y_expr,
> +                                      var_names, s->var_values,
> +                                      NULL, NULL, NULL, NULL, NULL, 0,
> ctx)) < 0)
> +        goto fail;
> +    s->var_values[VAR_Y] = res;
> +    if (normalize_double(&s->w, s->var_values[VAR_OW]) < 0 ||
> +        normalize_double(&s->h, s->var_values[VAR_OH]) < 0 ||
> +        normalize_double(&s->x, s->var_values[VAR_X]) < 0 ||
> +        normalize_double(&s->y, s->var_values[VAR_Y]) < 0) {
> +        av_log(ctx, AV_LOG_ERROR,
> +               "Too big value or invalid expression for out_w/ow or
> out_h/oh or x or y");
> +        return AVERROR(EINVAL);
> +    }
> +
> +fail:
> +    return ret;
> +}
> +
> +static av_cold int cudacrop_config_output(AVFilterLink *outlink)
> +{
> +    AVFilterContext *ctx = outlink->src;
> +    AVFilterLink *inlink = outlink->src->inputs[0];
> +    CUDACropContext *s  = ctx->priv;
> +    AVHWFramesContext     *frames_ctx =
> (AVHWFramesContext*)inlink->hw_frames_ctx->data;
> +    AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx;
> +    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
> +    CudaFunctions *cu = device_hwctx->internal->cuda_dl;
> +    int ret;
> +
> +    extern char vf_crop_cuda_ptx[];
> +
> +    s->hwctx = device_hwctx;
> +    s->cu_stream = s->hwctx->stream;
> +
> +    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
> +    if (ret < 0)
> +        goto fail;
> +
> +    ret = CHECK_CU(cu->cuModuleLoadData(&s->cu_module, vf_crop_cuda_ptx));
> +    if (ret < 0)
> +        goto fail;
> +
> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar, s->cu_module,
> "Crop_uchar"));
> +    if (ret < 0)
> +        goto fail;
> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module,
> "Crop_uchar2"));
> +    if (ret < 0)
> +        goto fail;
> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar4, s->cu_module,
> "Crop_uchar4"));
> +    if (ret < 0)
> +        goto fail;
> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort, s->cu_module,
> "Crop_ushort"));
> +    if (ret < 0)
> +        goto fail;
> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module,
> "Crop_ushort2"));
> +    if (ret < 0)
> +        goto fail;
> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort4, s->cu_module,
> "Crop_ushort4"));
> +    if (ret < 0)
> +        goto fail;
> +
> +    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
> +
> +    outlink->w = s->w;
> +    outlink->h = s->h;
> +
> +    ret = init_processing_chain(ctx, inlink->w, inlink->h, s->w, s->h,
> s->x, s->y);
> +    if (ret < 0)
> +        return ret;
> +
> +    if (inlink->sample_aspect_ratio.num) {
> +        outlink->sample_aspect_ratio =
> av_mul_q((AVRational){outlink->h*inlink->w,
> +
>  outlink->w*inlink->h},
> +
> inlink->sample_aspect_ratio);
> +    } else {
> +        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
> +    }
> +
> +    return 0;
> +
> +fail:
> +    return ret;
> +}
> +
> +static int call_crop_kernel(AVFilterContext *ctx, CUfunction func, int
> channels,
> +                              uint8_t *src_dptr, int src_width, int
> src_height, int src_pitch,
> +                              uint8_t *dst_dptr, int dst_width, int
> dst_height, int dst_pitch,
> +                              int left, int top, int pixel_size)
> +{
> +    CUDACropContext *s = ctx->priv;
> +    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
> +    CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr;
> +    CUtexObject tex = 0;
> +    void *args_uchar[] = { &tex, &dst_devptr, &dst_width, &dst_height,
> &dst_pitch, &left, &top };
> +    int ret;
> +
> +    CUDA_TEXTURE_DESC tex_desc = {
> +        .filterMode = CU_TR_FILTER_MODE_LINEAR,
> +        .flags = CU_TRSF_READ_AS_INTEGER,
> +    };
> +
> +    CUDA_RESOURCE_DESC res_desc = {
> +        .resType = CU_RESOURCE_TYPE_PITCH2D,
> +        .res.pitch2D.format = pixel_size == 1 ?
> +                              CU_AD_FORMAT_UNSIGNED_INT8 :
> +                              CU_AD_FORMAT_UNSIGNED_INT16,
> +        .res.pitch2D.numChannels = channels,
> +        .res.pitch2D.width = src_width,
> +        .res.pitch2D.height = src_height,
> +        .res.pitch2D.pitchInBytes = src_pitch,
> +        .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
> +    };
> +
> +    ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc,
> NULL));
> +    if (ret < 0)
> +        goto exit;
> +
> +    ret = CHECK_CU(cu->cuLaunchKernel(func, DIV_UP(dst_width, BLOCKX),
> DIV_UP(dst_height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, s->cu_stream,
> args_uchar, NULL));
> +
> +exit:
> +    if (tex)
> +        CHECK_CU(cu->cuTexObjectDestroy(tex));
> +    return ret;
> +}
> +
> +static int cropcuda_crop_internal(AVFilterContext *ctx,
> +                            AVFrame *out, AVFrame *in)
> +{
> +    AVHWFramesContext *in_frames_ctx =
> (AVHWFramesContext*)in->hw_frames_ctx->data;
> +    CUDACropContext *s = ctx->priv;
> +
> +    switch (in_frames_ctx->sw_format) {
> +    case AV_PIX_FMT_YUV420P:
> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> +                           in->data[0], in->width, in->height,
> in->linesize[0],
> +                           out->data[0], out->width, out->height,
> out->linesize[0],
> +                           s->planes_in[0].left, s->planes_in[0].top, 1);
> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> +                           in->data[0]+in->linesize[0]*in->height,
> in->width/2, in->height/2, in->linesize[0]/2,
> +                           out->data[0]+out->linesize[0]*out->height,
> out->width/2, out->height/2, out->linesize[0]/2,
> +                           s->planes_in[1].left, s->planes_in[1].top, 1);
> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> +                           in->data[0]+
> ALIGN_UP((in->linesize[0]*in->height*5)/4, s->tex_alignment), in->width/2,
> in->height/2, in->linesize[0]/2,
> +
>  out->data[0]+(out->linesize[0]*out->height*5)/4, out->width/2,
> out->height/2, out->linesize[0]/2,
> +                           s->planes_in[2].left, s->planes_in[2].top, 1);
> +        break;
> +    case AV_PIX_FMT_YUV444P:
> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> +                           in->data[0], in->width, in->height,
> in->linesize[0],
> +                           out->data[0], out->width, out->height,
> out->linesize[0],
> +                           s->planes_in[0].left, s->planes_in[0].top, 1);
> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> +                           in->data[0]+in->linesize[0]*in->height,
> in->width, in->height, in->linesize[0],
> +                           out->data[0]+out->linesize[0]*out->height,
> out->width, out->height, out->linesize[0],
> +                           s->planes_in[1].left, s->planes_in[1].top, 1);
> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> +                           in->data[0]+in->linesize[0]*in->height*2,
> in->width, in->height, in->linesize[0],
> +                           out->data[0]+out->linesize[0]*out->height*2,
> out->width, out->height, out->linesize[0],
> +                           s->planes_in[2].left, s->planes_in[2].top, 1);
> +        break;
> +    case AV_PIX_FMT_NV12:
> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> +                           in->data[0], in->width, in->height,
> in->linesize[0],
> +                           out->data[0], out->width, out->height,
> out->linesize[0],
> +                           s->planes_in[0].left, s->planes_in[0].top, 1);
> +        call_crop_kernel(ctx, s->cu_func_uchar2, 2,
> +                           in->data[1], in->width/2, in->height/2,
> in->linesize[1],
> +                           out->data[0] + out->linesize[0] *
> ((out->height + 31) & ~0x1f), out->width/2, out->height/2,
> out->linesize[1]/2,
> +                           s->planes_in[1].left, s->planes_in[1].top, 1);
> +        break;
> +    case AV_PIX_FMT_P010LE:
> +        call_crop_kernel(ctx, s->cu_func_ushort, 1,
> +                           in->data[0], in->width, in->height,
> in->linesize[0]/2,
> +                           out->data[0], out->width, out->height,
> out->linesize[0]/2,
> +                           s->planes_in[0].left, s->planes_in[0].top, 2);
> +        call_crop_kernel(ctx, s->cu_func_ushort2, 2,
> +                           in->data[1], in->width / 2, in->height / 2,
> in->linesize[1]/2,
> +                           out->data[0] + out->linesize[0] *
> ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2,
> out->linesize[1] / 4,
> +                           s->planes_in[1].left, s->planes_in[1].top, 2);
> +        break;
> +    case AV_PIX_FMT_P016LE:
> +        call_crop_kernel(ctx, s->cu_func_ushort, 1,
> +                           in->data[0], in->width, in->height,
> in->linesize[0] / 2,
> +                           out->data[0], out->width, out->height,
> out->linesize[0] / 2,
> +                           s->planes_in[0].left, s->planes_in[0].top, 2);
> +        call_crop_kernel(ctx, s->cu_func_ushort2, 2,
> +                           in->data[1], in->width / 2, in->height / 2,
> in->linesize[1] / 2,
> +                           out->data[0] + out->linesize[0] *
> ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2,
> out->linesize[1] / 4,
> +                           s->planes_in[1].left, s->planes_in[1].top, 2);
> +        break;
> +    default:
> +        return AVERROR_BUG;
> +    }
> +
> +    return 0;
> +}
> +
> +static int cudacrop_crop(AVFilterContext *ctx, AVFrame *out, AVFrame *in)
> +{
> +    CUDACropContext *s = ctx->priv;
> +    AVFrame *src = in;
> +    int ret;
> +
> +    ret = cropcuda_crop_internal(ctx, s->frame, src);
> +    if (ret < 0)
> +        return ret;
> +
> +    src = s->frame;
> +    ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0);
> +    if (ret < 0)
> +        return ret;
> +
> +    av_frame_move_ref(out, s->frame);
> +    av_frame_move_ref(s->frame, s->tmp_frame);
> +
> +    ret = av_frame_copy_props(out, in);
> +    if (ret < 0)
> +        return ret;
> +
> +    return 0;
> +}
> +
> +static int cudacrop_filter_frame(AVFilterLink *link, AVFrame *in)
> +{
> +    AVFilterContext              *ctx = link->dst;
> +    CUDACropContext              *s = ctx->priv;
> +    AVFilterLink             *outlink = ctx->outputs[0];
> +    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
> +
> +    AVFrame *out = NULL;
> +    CUcontext dummy;
> +    int ret = 0;
> +
> +    out = av_frame_alloc();
> +    if (!out) {
> +        ret = AVERROR(ENOMEM);
> +        goto fail;
> +    }
> +
> +    ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
> +    if (ret < 0)
> +        goto fail;
> +
> +    ret = cudacrop_crop(ctx, out, in);
> +
> +    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
> +    if (ret < 0)
> +        goto fail;
> +
> +    av_reduce(&out->sample_aspect_ratio.num,
> &out->sample_aspect_ratio.den,
> +              (int64_t)in->sample_aspect_ratio.num * outlink->h * link->w,
> +              (int64_t)in->sample_aspect_ratio.den * outlink->w * link->h,
> +              INT_MAX);
> +
> +    av_frame_free(&in);
> +    return ff_filter_frame(outlink, out);
> +fail:
> +    av_frame_free(&in);
> +    av_frame_free(&out);
> +    return ret;
> +}
> +
> +#define OFFSET(x) offsetof(CUDACropContext, x)
> +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM)
> +static const AVOption options[] = {
> +    { "w",      "set the width crop area expression",  OFFSET(w_expr),
>  AV_OPT_TYPE_STRING, { .str = "iw"   }, .flags = FLAGS },
> +    { "h",      "set the height crop area expression", OFFSET(h_expr),
>  AV_OPT_TYPE_STRING, { .str = "ih"   }, .flags = FLAGS },
> +    { "x",      "set the x crop area expression",      OFFSET(x_expr),
>  AV_OPT_TYPE_STRING, { .str = "(in_w-out_w)/2"}, .flags = FLAGS },
> +    { "y",      "set the y crop area expression",      OFFSET(y_expr),
>  AV_OPT_TYPE_STRING, { .str = "(in_h-out_h)/2"}, .flags = FLAGS },
> +    { NULL },
> +};
> +
> +static const AVClass cudacrop_class = {
> +    .class_name = "cudacrop",
> +    .item_name  = av_default_item_name,
> +    .option     = options,
> +    .version    = LIBAVUTIL_VERSION_INT,
> +};
> +
> +static const AVFilterPad cudacrop_inputs[] = {
> +    {
> +        .name        = "default",
> +        .type        = AVMEDIA_TYPE_VIDEO,
> +        .filter_frame = cudacrop_filter_frame,
> +        .config_props = cudacrop_config_input,
> +    },
> +    { NULL }
> +};
> +
> +static const AVFilterPad cudacrop_outputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +        .config_props = cudacrop_config_output,
> +    },
> +    { NULL }
> +};
> +
> +AVFilter ff_vf_crop_cuda = {
> +    .name      = "crop_cuda",
> +    .description = NULL_IF_CONFIG_SMALL("GPU accelerated video crop"),
> +
> +    .init          = cudacrop_init,
> +    .uninit        = cudacrop_uninit,
> +    .query_formats = cudacrop_query_formats,
> +
> +    .priv_size = sizeof(CUDACropContext),
> +    .priv_class = &cudacrop_class,
> +
> +    .inputs    = cudacrop_inputs,
> +    .outputs   = cudacrop_outputs,
> +
> +    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
> +};
> diff --git a/libavfilter/vf_crop_cuda.cu b/libavfilter/vf_crop_cuda.cu
> new file mode 100644
> index 0000000..4b94b73
> --- /dev/null
> +++ b/libavfilter/vf_crop_cuda.cu
> @@ -0,0 +1,109 @@
> +/*
> + * Copyright (c) 2019, iQIYI CORPORATION. All rights reserved.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the
> "Software"),
> + * to deal in the Software without restriction, including without
> limitation
> + * the rights to use, copy, modify, merge, publish, distribute,
> sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be
> included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
> SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
> OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +extern "C" {
> +
> +__global__ void Crop_uchar(cudaTextureObject_t uchar_tex,
> +                           unsigned char *dst,
> +                           int dst_width, int dst_height, int dst_pitch,
> +                           int left, int top)
> +{
> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> +    int xi = xo + left;
> +    int yi = yo + top;
> +
> +    if (yo < dst_height && xo < dst_width)
> +        dst[yo*dst_pitch+xo] = (unsigned char) tex2D<unsigned
> char>(uchar_tex, xi, yi);
> +}
> +
> +__global__ void Crop_uchar2(cudaTextureObject_t uchar2_tex,
> +                            uchar2 *dst,
> +                            int dst_width, int dst_height, int dst_pitch,
> +                            int left, int top)
> +{
> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> +    int xi = xo + left;
> +    int yi = yo + top;
> +
> +    if (yo < dst_height && xo < dst_width)
> +        dst[yo*dst_pitch+xo] = (uchar2) tex2D<uchar2>(uchar2_tex, xi, yi);
> +}
> +
> +__global__ void Crop_uchar4(cudaTextureObject_t uchar4_tex,
> +                            uchar4 *dst,
> +                            int dst_width, int dst_height, int dst_pitch,
> +                            int left, int top)
> +{
> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> +    int xi = xo + left;
> +    int yi = yo + top;
> +
> +    if (yo < dst_height && xo < dst_width)
> +        dst[yo*dst_pitch+xo] = (uchar4) tex2D<uchar4>(uchar4_tex, xi, yi);
> +}
> +
> +__global__ void Crop_ushort(cudaTextureObject_t ushort_tex,
> +                            unsigned short *dst,
> +                            int dst_width, int dst_height, int dst_pitch,
> +                            int left, int top)
> +{
> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> +    int xi = xo + left;
> +    int yi = yo + top;
> +
> +    if (yo < dst_height && xo < dst_width)
> +        dst[yo*dst_pitch+xo] = (unsigned short) tex2D<unsigned
> short>(ushort_tex, xi, yi);
> +}
> +
> +__global__ void Crop_ushort2(cudaTextureObject_t ushort2_tex,
> +                             ushort2 *dst,
> +                             int dst_width, int dst_height, int dst_pitch,
> +                             int left, int top)
> +{
> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> +    int xi = xo + left;
> +    int yi = yo + top;
> +
> +    if (yo < dst_height && xo < dst_width)
> +        dst[yo*dst_pitch+xo] = (ushort2) tex2D<ushort2>(ushort2_tex, xi,
> yi);
> +}
> +
> +__global__ void Crop_ushort4(cudaTextureObject_t ushort4_tex,
> +                             ushort4 *dst,
> +                             int dst_width, int dst_height, int dst_pitch,
> +                             int left, int top)
> +{
> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> +    int xi = xo + left;
> +    int yi = yo + top;
> +
> +    if (yo < dst_height && xo < dst_width)
> +        dst[yo*dst_pitch+xo] = (ushort4) tex2D<ushort4>(ushort4_tex, xi,
> yi);
> +}
> +
> +}
> --
> 1.8.3.1
>
>
Liu Steven March 24, 2019, 12:59 a.m. UTC | #2
> 在 2019年3月24日,07:26,Tao Zhang <nowerzt@gmail.com> 写道:
> 
> The corrected version. If there are no other comments or objections, could
> this be pushed?
Of course, maybe this need waiting for other reviewer, about 24 hours after maybe better than push it now.
Because other reviewer maybe busy or not online weekend.

> 
> UsingtcNower <nowerzt@gmail.com> 于2019年3月23日周六 下午11:51写道:
> 
>> Signed-off-by: UsingtcNower <nowerzt@gmail.com>
>> ---
>> Changelog                   |   1 +
>> configure                   |   1 +
>> doc/filters.texi            |  31 +++
>> libavfilter/Makefile        |   1 +
>> libavfilter/allfilters.c    |   1 +
>> libavfilter/version.h       |   2 +-
>> libavfilter/vf_crop_cuda.c  | 638
>> ++++++++++++++++++++++++++++++++++++++++++++
>> libavfilter/vf_crop_cuda.cu | 109 ++++++++
>> 8 files changed, 783 insertions(+), 1 deletion(-)
>> create mode 100644 libavfilter/vf_crop_cuda.c
>> create mode 100644 libavfilter/vf_crop_cuda.cu
>> 
>> diff --git a/Changelog b/Changelog
>> index ad7e82f..f224fc8 100644
>> --- a/Changelog
>> +++ b/Changelog
>> @@ -20,6 +20,7 @@ version <next>:
>> - libaribb24 based ARIB STD-B24 caption support (profiles A and C)
>> - Support decoding of HEVC 4:4:4 content in nvdec and cuviddec
>> - removed libndi-newtek
>> +- crop_cuda GPU accelerated video crop filter
>> 
>> 
>> version 4.1:
>> diff --git a/configure b/configure
>> index 331393f..3f3ac2f 100755
>> --- a/configure
>> +++ b/configure
>> @@ -2973,6 +2973,7 @@ qsvvpp_select="qsv"
>> vaapi_encode_deps="vaapi"
>> v4l2_m2m_deps="linux_videodev2_h sem_timedwait"
>> 
>> +crop_cuda_filter_deps="ffnvcodec cuda_nvcc"
>> hwupload_cuda_filter_deps="ffnvcodec"
>> scale_npp_filter_deps="ffnvcodec libnpp"
>> scale_cuda_filter_deps="ffnvcodec cuda_nvcc"
>> diff --git a/doc/filters.texi b/doc/filters.texi
>> index 4ffb392..ee16a2d 100644
>> --- a/doc/filters.texi
>> +++ b/doc/filters.texi
>> @@ -7415,6 +7415,37 @@ If the specified expression is not valid, it is
>> kept at its current
>> value.
>> @end table
>> 
>> +@section crop_cuda
>> +
>> +Crop the input video to given dimensions, implemented in CUDA.
>> +
>> +It accepts the following parameters:
>> +
>> +@table @option
>> +
>> +@item w
>> +The width of the output video. It defaults to @code{iw}.
>> +This expression is evaluated only once during the filter
>> +configuration.
>> +
>> +@item h
>> +The height of the output video. It defaults to @code{ih}.
>> +This expression is evaluated only once during the filter
>> +configuration.
>> +
>> +@item x
>> +The horizontal position, in the input video, of the left edge of the
>> output
>> +video. It defaults to @code{(in_w-out_w)/2}.
>> +This expression is evaluated only once during the filter
>> +configuration.
>> +
>> +@item y
>> +The vertical position, in the input video, of the top edge of the output
>> video.
>> +It defaults to @code{(in_h-out_h)/2}.
>> +This expression is evaluated only once during the filter
>> +configuration.
>> +@end table
>> +
>> @section cropdetect
>> 
>> Auto-detect the crop size.
>> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
>> index fef6ec5..84df037 100644
>> --- a/libavfilter/Makefile
>> +++ b/libavfilter/Makefile
>> @@ -187,6 +187,7 @@ OBJS-$(CONFIG_COPY_FILTER)                   +=
>> vf_copy.o
>> OBJS-$(CONFIG_COREIMAGE_FILTER)              += vf_coreimage.o
>> OBJS-$(CONFIG_COVER_RECT_FILTER)             += vf_cover_rect.o
>> lavfutils.o
>> OBJS-$(CONFIG_CROP_FILTER)                   += vf_crop.o
>> +OBJS-$(CONFIG_CROP_CUDA_FILTER)              += vf_crop_cuda.o
>> vf_crop_cuda.ptx.o
>> OBJS-$(CONFIG_CROPDETECT_FILTER)             += vf_cropdetect.o
>> OBJS-$(CONFIG_CUE_FILTER)                    += f_cue.o
>> OBJS-$(CONFIG_CURVES_FILTER)                 += vf_curves.o
>> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
>> index c51ae0f..550e545 100644
>> --- a/libavfilter/allfilters.c
>> +++ b/libavfilter/allfilters.c
>> @@ -175,6 +175,7 @@ extern AVFilter ff_vf_copy;
>> extern AVFilter ff_vf_coreimage;
>> extern AVFilter ff_vf_cover_rect;
>> extern AVFilter ff_vf_crop;
>> +extern AVFilter ff_vf_crop_cuda;
>> extern AVFilter ff_vf_cropdetect;
>> extern AVFilter ff_vf_cue;
>> extern AVFilter ff_vf_curves;
>> diff --git a/libavfilter/version.h b/libavfilter/version.h
>> index c71282c..5aa95f4 100644
>> --- a/libavfilter/version.h
>> +++ b/libavfilter/version.h
>> @@ -31,7 +31,7 @@
>> 
>> #define LIBAVFILTER_VERSION_MAJOR   7
>> #define LIBAVFILTER_VERSION_MINOR  48
>> -#define LIBAVFILTER_VERSION_MICRO 100
>> +#define LIBAVFILTER_VERSION_MICRO 101
>> 
>> #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR,
>> \
>>                                                LIBAVFILTER_VERSION_MINOR,
>> \
>> diff --git a/libavfilter/vf_crop_cuda.c b/libavfilter/vf_crop_cuda.c
>> new file mode 100644
>> index 0000000..fc6a2a6
>> --- /dev/null
>> +++ b/libavfilter/vf_crop_cuda.c
>> @@ -0,0 +1,638 @@
>> +/*
>> +* Copyright (c) 2019, iQIYI CORPORATION. All rights reserved.
>> +*
>> +* Permission is hereby granted, free of charge, to any person obtaining a
>> +* copy of this software and associated documentation files (the
>> "Software"),
>> +* to deal in the Software without restriction, including without
>> limitation
>> +* the rights to use, copy, modify, merge, publish, distribute, sublicense,
>> +* and/or sell copies of the Software, and to permit persons to whom the
>> +* Software is furnished to do so, subject to the following conditions:
>> +*
>> +* The above copyright notice and this permission notice shall be included
>> in
>> +* all copies or substantial portions of the Software.
>> +*
>> +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
>> OR
>> +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>> +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
>> OTHER
>> +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
>> +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
>> +* DEALINGS IN THE SOFTWARE.
>> +*/
>> +
>> +#include <stdio.h>
>> +#include <string.h>
>> +
>> +#include "libavutil/avstring.h"
>> +#include "libavutil/common.h"
>> +#include "libavutil/hwcontext.h"
>> +#include "libavutil/hwcontext_cuda_internal.h"
>> +#include "libavutil/cuda_check.h"
>> +#include "libavutil/internal.h"
>> +#include "libavutil/opt.h"
>> +#include "libavutil/pixdesc.h"
>> +#include "libavutil/eval.h"
>> +
>> +#include "avfilter.h"
>> +#include "formats.h"
>> +#include "internal.h"
>> +#include "video.h"
>> +
>> +static const char *const var_names[] = {
>> +    "in_w", "iw",   ///< width  of the input video
>> +    "in_h", "ih",   ///< height of the input video
>> +    "out_w", "ow",  ///< width  of the cropped video
>> +    "out_h", "oh",  ///< height of the cropped video
>> +    "x",
>> +    "y",
>> +    NULL
>> +};
>> +
>> +enum var_name {
>> +    VAR_IN_W,  VAR_IW,
>> +    VAR_IN_H,  VAR_IH,
>> +    VAR_OUT_W, VAR_OW,
>> +    VAR_OUT_H, VAR_OH,
>> +    VAR_X,
>> +    VAR_Y,
>> +    VAR_VARS_NB
>> +};
>> +
>> +static const enum AVPixelFormat supported_formats[] = {
>> +    AV_PIX_FMT_YUV420P,
>> +    AV_PIX_FMT_NV12,
>> +    AV_PIX_FMT_YUV444P,
>> +    AV_PIX_FMT_P010,
>> +    AV_PIX_FMT_P016
>> +};
>> +
>> +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
>> +#define ALIGN_UP(a, b) (((a) + (b) - 1) & ~((b) - 1))
>> +#define NUM_BUFFERS 2
>> +#define BLOCKX 32
>> +#define BLOCKY 16
>> +
>> +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
>> +
>> +typedef struct CUDACropContext {
>> +    const AVClass *class;
>> +    AVCUDADeviceContext *hwctx;
>> +    enum AVPixelFormat in_fmt;
>> +    enum AVPixelFormat out_fmt;
>> +
>> +    struct {
>> +        int width;
>> +        int height;
>> +        int left;
>> +        int top;
>> +    } planes_in[3], planes_out[3];
>> +
>> +    AVBufferRef *frames_ctx;
>> +    AVFrame     *frame;
>> +
>> +    AVFrame *tmp_frame;
>> +    int passthrough;
>> +
>> +    /**
>> +     * Output sw format. AV_PIX_FMT_NONE for no conversion.
>> +     */
>> +    enum AVPixelFormat format;
>> +
>> +    int w,h,x,y;
>> +    char *w_expr, *h_expr, *x_expr, *y_expr;
>> +    double var_values[VAR_VARS_NB];
>> +
>> +    CUcontext   cu_ctx;
>> +    CUmodule    cu_module;
>> +    CUfunction  cu_func_uchar;
>> +    CUfunction  cu_func_uchar2;
>> +    CUfunction  cu_func_uchar4;
>> +    CUfunction  cu_func_ushort;
>> +    CUfunction  cu_func_ushort2;
>> +    CUfunction  cu_func_ushort4;
>> +    CUstream    cu_stream;
>> +
>> +    CUdeviceptr srcBuffer;
>> +    CUdeviceptr dstBuffer;
>> +    int         tex_alignment;
>> +} CUDACropContext;
>> +
>> +static av_cold int cudacrop_init(AVFilterContext *ctx)
>> +{
>> +    CUDACropContext *s = ctx->priv;
>> +
>> +    s->format = AV_PIX_FMT_NONE;
>> +    s->frame = av_frame_alloc();
>> +    if (!s->frame)
>> +        return AVERROR(ENOMEM);
>> +
>> +    s->tmp_frame = av_frame_alloc();
>> +    if (!s->tmp_frame)
>> +        return AVERROR(ENOMEM);
>> +
>> +    return 0;
>> +}
>> +
>> +static av_cold void cudacrop_uninit(AVFilterContext *ctx)
>> +{
>> +    CUDACropContext *s = ctx->priv;
>> +
>> +    av_frame_free(&s->frame);
>> +    av_buffer_unref(&s->frames_ctx);
>> +    av_frame_free(&s->tmp_frame);
>> +}
>> +
>> +static int cudacrop_query_formats(AVFilterContext *ctx)
>> +{
>> +    static const enum AVPixelFormat pixel_formats[] = {
>> +        AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE,
>> +    };
>> +    AVFilterFormats *pix_fmts = ff_make_format_list(pixel_formats);
>> +
>> +    return ff_set_common_formats(ctx, pix_fmts);
>> +}
>> +
>> +static av_cold int init_stage(CUDACropContext *s, AVBufferRef *device_ctx)
>> +{
>> +    AVBufferRef *out_ref = NULL;
>> +    AVHWFramesContext *out_ctx;
>> +    int in_sw, in_sh, out_sw, out_sh;
>> +    int ret, i;
>> +
>> +    av_pix_fmt_get_chroma_sub_sample(s->in_fmt,  &in_sw,  &in_sh);
>> +    av_pix_fmt_get_chroma_sub_sample(s->out_fmt, &out_sw, &out_sh);
>> +    if (!s->planes_out[0].width) {
>> +        s->planes_out[0].width  = s->planes_in[0].width;
>> +        s->planes_out[0].height = s->planes_in[0].height;
>> +        s->planes_out[0].left = s->planes_in[0].left;
>> +        s->planes_out[0].top  = s->planes_in[0].top;
>> +    }
>> +
>> +    for (i = 1; i < FF_ARRAY_ELEMS(s->planes_in); i++) {
>> +        s->planes_in[i].width   = s->planes_in[0].width   >> in_sw;
>> +        s->planes_in[i].height  = s->planes_in[0].height  >> in_sh;
>> +        s->planes_in[i].left    = s->planes_in[0].left    >> in_sw;
>> +        s->planes_in[i].top     = s->planes_in[0].top     >> in_sh;
>> +        s->planes_out[i].width  = s->planes_out[0].width  >> out_sw;
>> +        s->planes_out[i].height = s->planes_out[0].height >> out_sh;
>> +        s->planes_out[i].left   = 0;
>> +        s->planes_out[i].top    = 0;
>> +
>> +    }
>> +
>> +    out_ref = av_hwframe_ctx_alloc(device_ctx);
>> +    if (!out_ref)
>> +        return AVERROR(ENOMEM);
>> +    out_ctx = (AVHWFramesContext*)out_ref->data;
>> +
>> +    out_ctx->format    = AV_PIX_FMT_CUDA;
>> +    out_ctx->sw_format = s->out_fmt;
>> +    out_ctx->width     = FFALIGN(s->planes_out[0].width,  32);
>> +    out_ctx->height    = FFALIGN(s->planes_out[0].height, 32);
>> +
>> +    ret = av_hwframe_ctx_init(out_ref);
>> +    if (ret < 0)
>> +        goto fail;
>> +
>> +    av_frame_unref(s->frame);
>> +    ret = av_hwframe_get_buffer(out_ref, s->frame, 0);
>> +    if (ret < 0)
>> +        goto fail;
>> +
>> +    s->frame->width  = s->planes_out[0].width;
>> +    s->frame->height = s->planes_out[0].height;
>> +
>> +    av_buffer_unref(&s->frames_ctx);
>> +    s->frames_ctx = out_ref;
>> +
>> +    return 0;
>> +fail:
>> +    av_buffer_unref(&out_ref);
>> +    return ret;
>> +}
>> +
>> +static int format_is_supported(enum AVPixelFormat fmt)
>> +{
>> +    int i;
>> +
>> +    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
>> +        if (supported_formats[i] == fmt)
>> +            return 1;
>> +    return 0;
>> +}
>> +
>> +static av_cold int init_processing_chain(AVFilterContext *ctx, int
>> in_width, int in_height,
>> +                                         int out_width, int out_height,
>> +                                         int left, int top)
>> +{
>> +    CUDACropContext *s = ctx->priv;
>> +
>> +    AVHWFramesContext *in_frames_ctx;
>> +
>> +    enum AVPixelFormat in_format;
>> +    enum AVPixelFormat out_format;
>> +    int ret;
>> +
>> +    /* check that we have a hw context */
>> +    if (!ctx->inputs[0]->hw_frames_ctx) {
>> +        av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n");
>> +        return AVERROR(EINVAL);
>> +    }
>> +    in_frames_ctx =
>> (AVHWFramesContext*)ctx->inputs[0]->hw_frames_ctx->data;
>> +    in_format     = in_frames_ctx->sw_format;
>> +    out_format    = (s->format == AV_PIX_FMT_NONE) ? in_format :
>> s->format;
>> +
>> +    if (!format_is_supported(in_format)) {
>> +        av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n",
>> +               av_get_pix_fmt_name(in_format));
>> +        return AVERROR(ENOSYS);
>> +    }
>> +    if (!format_is_supported(out_format)) {
>> +        av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n",
>> +               av_get_pix_fmt_name(out_format));
>> +        return AVERROR(ENOSYS);
>> +    }
>> +
>> +    if (in_width == out_width && in_height == out_height)
>> +        s->passthrough = 1;
>> +
>> +    s->in_fmt = in_format;
>> +    s->out_fmt = out_format;
>> +
>> +    s->planes_in[0].width   = in_width;
>> +    s->planes_in[0].height  = in_height;
>> +    s->planes_in[0].left = left;
>> +    s->planes_in[0].top = top;
>> +    s->planes_out[0].width  = out_width;
>> +    s->planes_out[0].height = out_height;
>> +    s->planes_out[0].left = 0;
>> +    s->planes_out[0].top = 0;
>> +
>> +    ret = init_stage(s, in_frames_ctx->device_ref);
>> +    if (ret < 0)
>> +        return ret;
>> +
>> +    ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->frames_ctx);
>> +    if (!ctx->outputs[0]->hw_frames_ctx)
>> +        return AVERROR(ENOMEM);
>> +
>> +    return 0;
>> +}
>> +
>> +static inline int normalize_double(int *n, double d)
>> +{
>> +    int ret = 0;
>> +
>> +    if (isnan(d))
>> +        ret = AVERROR(EINVAL);
>> +    else if (d > INT_MAX || d < INT_MIN) {
>> +        *n = d > INT_MAX ? INT_MAX : INT_MIN;
>> +        ret = AVERROR(EINVAL);
>> +    } else
>> +        *n = lrint(d);
>> +
>> +    return ret;
>> +}
>> +
>> +static av_cold int cudacrop_config_input(AVFilterLink *inlink)
>> +{
>> +    AVFilterContext *ctx = inlink->dst;
>> +    CUDACropContext *s = ctx->priv;
>> +    double res;
>> +    int ret;
>> +
>> +    s->var_values[VAR_IN_W] = s->var_values[VAR_IW] = inlink->w;
>> +    s->var_values[VAR_IN_H] = s->var_values[VAR_IH] = inlink->h;
>> +    s->var_values[VAR_OUT_W] = s->var_values[VAR_OW] = NAN;
>> +    s->var_values[VAR_OUT_H] = s->var_values[VAR_OH] = NAN;
>> +    s->var_values[VAR_X] = NAN;
>> +    s->var_values[VAR_Y] = NAN;
>> +    if ((ret = av_expr_parse_and_eval(&res, s->w_expr,
>> +                                      var_names, s->var_values,
>> +                                      NULL, NULL, NULL, NULL, NULL, 0,
>> ctx)) < 0)
>> +        goto fail;
>> +    s->var_values[VAR_OUT_W] = s->var_values[VAR_OW] = res;
>> +    if ((ret = av_expr_parse_and_eval(&res, s->h_expr,
>> +                                      var_names, s->var_values,
>> +                                      NULL, NULL, NULL, NULL, NULL, 0,
>> ctx)) < 0)
>> +        goto fail;
>> +    s->var_values[VAR_OUT_H] = s->var_values[VAR_OH] = res;
>> +    if ((ret = av_expr_parse_and_eval(&res, s->x_expr,
>> +                                      var_names, s->var_values,
>> +                                      NULL, NULL, NULL, NULL, NULL, 0,
>> ctx)) < 0)
>> +        goto fail;
>> +    s->var_values[VAR_X] = res;
>> +    if ((ret = av_expr_parse_and_eval(&res, s->y_expr,
>> +                                      var_names, s->var_values,
>> +                                      NULL, NULL, NULL, NULL, NULL, 0,
>> ctx)) < 0)
>> +        goto fail;
>> +    s->var_values[VAR_Y] = res;
>> +    if (normalize_double(&s->w, s->var_values[VAR_OW]) < 0 ||
>> +        normalize_double(&s->h, s->var_values[VAR_OH]) < 0 ||
>> +        normalize_double(&s->x, s->var_values[VAR_X]) < 0 ||
>> +        normalize_double(&s->y, s->var_values[VAR_Y]) < 0) {
>> +        av_log(ctx, AV_LOG_ERROR,
>> +               "Too big value or invalid expression for out_w/ow or
>> out_h/oh or x or y");
>> +        return AVERROR(EINVAL);
>> +    }
>> +
>> +fail:
>> +    return ret;
>> +}
>> +
>> +static av_cold int cudacrop_config_output(AVFilterLink *outlink)
>> +{
>> +    AVFilterContext *ctx = outlink->src;
>> +    AVFilterLink *inlink = outlink->src->inputs[0];
>> +    CUDACropContext *s  = ctx->priv;
>> +    AVHWFramesContext     *frames_ctx =
>> (AVHWFramesContext*)inlink->hw_frames_ctx->data;
>> +    AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx;
>> +    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
>> +    CudaFunctions *cu = device_hwctx->internal->cuda_dl;
>> +    int ret;
>> +
>> +    extern char vf_crop_cuda_ptx[];
>> +
>> +    s->hwctx = device_hwctx;
>> +    s->cu_stream = s->hwctx->stream;
>> +
>> +    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
>> +    if (ret < 0)
>> +        goto fail;
>> +
>> +    ret = CHECK_CU(cu->cuModuleLoadData(&s->cu_module, vf_crop_cuda_ptx));
>> +    if (ret < 0)
>> +        goto fail;
>> +
>> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar, s->cu_module,
>> "Crop_uchar"));
>> +    if (ret < 0)
>> +        goto fail;
>> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module,
>> "Crop_uchar2"));
>> +    if (ret < 0)
>> +        goto fail;
>> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar4, s->cu_module,
>> "Crop_uchar4"));
>> +    if (ret < 0)
>> +        goto fail;
>> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort, s->cu_module,
>> "Crop_ushort"));
>> +    if (ret < 0)
>> +        goto fail;
>> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module,
>> "Crop_ushort2"));
>> +    if (ret < 0)
>> +        goto fail;
>> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort4, s->cu_module,
>> "Crop_ushort4"));
>> +    if (ret < 0)
>> +        goto fail;
>> +
>> +    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
>> +
>> +    outlink->w = s->w;
>> +    outlink->h = s->h;
>> +
>> +    ret = init_processing_chain(ctx, inlink->w, inlink->h, s->w, s->h,
>> s->x, s->y);
>> +    if (ret < 0)
>> +        return ret;
>> +
>> +    if (inlink->sample_aspect_ratio.num) {
>> +        outlink->sample_aspect_ratio =
>> av_mul_q((AVRational){outlink->h*inlink->w,
>> +
>> outlink->w*inlink->h},
>> +
>> inlink->sample_aspect_ratio);
>> +    } else {
>> +        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
>> +    }
>> +
>> +    return 0;
>> +
>> +fail:
>> +    return ret;
>> +}
>> +
>> +static int call_crop_kernel(AVFilterContext *ctx, CUfunction func, int
>> channels,
>> +                              uint8_t *src_dptr, int src_width, int
>> src_height, int src_pitch,
>> +                              uint8_t *dst_dptr, int dst_width, int
>> dst_height, int dst_pitch,
>> +                              int left, int top, int pixel_size)
>> +{
>> +    CUDACropContext *s = ctx->priv;
>> +    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
>> +    CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr;
>> +    CUtexObject tex = 0;
>> +    void *args_uchar[] = { &tex, &dst_devptr, &dst_width, &dst_height,
>> &dst_pitch, &left, &top };
>> +    int ret;
>> +
>> +    CUDA_TEXTURE_DESC tex_desc = {
>> +        .filterMode = CU_TR_FILTER_MODE_LINEAR,
>> +        .flags = CU_TRSF_READ_AS_INTEGER,
>> +    };
>> +
>> +    CUDA_RESOURCE_DESC res_desc = {
>> +        .resType = CU_RESOURCE_TYPE_PITCH2D,
>> +        .res.pitch2D.format = pixel_size == 1 ?
>> +                              CU_AD_FORMAT_UNSIGNED_INT8 :
>> +                              CU_AD_FORMAT_UNSIGNED_INT16,
>> +        .res.pitch2D.numChannels = channels,
>> +        .res.pitch2D.width = src_width,
>> +        .res.pitch2D.height = src_height,
>> +        .res.pitch2D.pitchInBytes = src_pitch,
>> +        .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
>> +    };
>> +
>> +    ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc,
>> NULL));
>> +    if (ret < 0)
>> +        goto exit;
>> +
>> +    ret = CHECK_CU(cu->cuLaunchKernel(func, DIV_UP(dst_width, BLOCKX),
>> DIV_UP(dst_height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, s->cu_stream,
>> args_uchar, NULL));
>> +
>> +exit:
>> +    if (tex)
>> +        CHECK_CU(cu->cuTexObjectDestroy(tex));
>> +    return ret;
>> +}
>> +
>> +static int cropcuda_crop_internal(AVFilterContext *ctx,
>> +                            AVFrame *out, AVFrame *in)
>> +{
>> +    AVHWFramesContext *in_frames_ctx =
>> (AVHWFramesContext*)in->hw_frames_ctx->data;
>> +    CUDACropContext *s = ctx->priv;
>> +
>> +    switch (in_frames_ctx->sw_format) {
>> +    case AV_PIX_FMT_YUV420P:
>> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
>> +                           in->data[0], in->width, in->height,
>> in->linesize[0],
>> +                           out->data[0], out->width, out->height,
>> out->linesize[0],
>> +                           s->planes_in[0].left, s->planes_in[0].top, 1);
>> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
>> +                           in->data[0]+in->linesize[0]*in->height,
>> in->width/2, in->height/2, in->linesize[0]/2,
>> +                           out->data[0]+out->linesize[0]*out->height,
>> out->width/2, out->height/2, out->linesize[0]/2,
>> +                           s->planes_in[1].left, s->planes_in[1].top, 1);
>> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
>> +                           in->data[0]+
>> ALIGN_UP((in->linesize[0]*in->height*5)/4, s->tex_alignment), in->width/2,
>> in->height/2, in->linesize[0]/2,
>> +
>> out->data[0]+(out->linesize[0]*out->height*5)/4, out->width/2,
>> out->height/2, out->linesize[0]/2,
>> +                           s->planes_in[2].left, s->planes_in[2].top, 1);
>> +        break;
>> +    case AV_PIX_FMT_YUV444P:
>> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
>> +                           in->data[0], in->width, in->height,
>> in->linesize[0],
>> +                           out->data[0], out->width, out->height,
>> out->linesize[0],
>> +                           s->planes_in[0].left, s->planes_in[0].top, 1);
>> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
>> +                           in->data[0]+in->linesize[0]*in->height,
>> in->width, in->height, in->linesize[0],
>> +                           out->data[0]+out->linesize[0]*out->height,
>> out->width, out->height, out->linesize[0],
>> +                           s->planes_in[1].left, s->planes_in[1].top, 1);
>> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
>> +                           in->data[0]+in->linesize[0]*in->height*2,
>> in->width, in->height, in->linesize[0],
>> +                           out->data[0]+out->linesize[0]*out->height*2,
>> out->width, out->height, out->linesize[0],
>> +                           s->planes_in[2].left, s->planes_in[2].top, 1);
>> +        break;
>> +    case AV_PIX_FMT_NV12:
>> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
>> +                           in->data[0], in->width, in->height,
>> in->linesize[0],
>> +                           out->data[0], out->width, out->height,
>> out->linesize[0],
>> +                           s->planes_in[0].left, s->planes_in[0].top, 1);
>> +        call_crop_kernel(ctx, s->cu_func_uchar2, 2,
>> +                           in->data[1], in->width/2, in->height/2,
>> in->linesize[1],
>> +                           out->data[0] + out->linesize[0] *
>> ((out->height + 31) & ~0x1f), out->width/2, out->height/2,
>> out->linesize[1]/2,
>> +                           s->planes_in[1].left, s->planes_in[1].top, 1);
>> +        break;
>> +    case AV_PIX_FMT_P010LE:
>> +        call_crop_kernel(ctx, s->cu_func_ushort, 1,
>> +                           in->data[0], in->width, in->height,
>> in->linesize[0]/2,
>> +                           out->data[0], out->width, out->height,
>> out->linesize[0]/2,
>> +                           s->planes_in[0].left, s->planes_in[0].top, 2);
>> +        call_crop_kernel(ctx, s->cu_func_ushort2, 2,
>> +                           in->data[1], in->width / 2, in->height / 2,
>> in->linesize[1]/2,
>> +                           out->data[0] + out->linesize[0] *
>> ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2,
>> out->linesize[1] / 4,
>> +                           s->planes_in[1].left, s->planes_in[1].top, 2);
>> +        break;
>> +    case AV_PIX_FMT_P016LE:
>> +        call_crop_kernel(ctx, s->cu_func_ushort, 1,
>> +                           in->data[0], in->width, in->height,
>> in->linesize[0] / 2,
>> +                           out->data[0], out->width, out->height,
>> out->linesize[0] / 2,
>> +                           s->planes_in[0].left, s->planes_in[0].top, 2);
>> +        call_crop_kernel(ctx, s->cu_func_ushort2, 2,
>> +                           in->data[1], in->width / 2, in->height / 2,
>> in->linesize[1] / 2,
>> +                           out->data[0] + out->linesize[0] *
>> ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2,
>> out->linesize[1] / 4,
>> +                           s->planes_in[1].left, s->planes_in[1].top, 2);
>> +        break;
>> +    default:
>> +        return AVERROR_BUG;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int cudacrop_crop(AVFilterContext *ctx, AVFrame *out, AVFrame *in)
>> +{
>> +    CUDACropContext *s = ctx->priv;
>> +    AVFrame *src = in;
>> +    int ret;
>> +
>> +    ret = cropcuda_crop_internal(ctx, s->frame, src);
>> +    if (ret < 0)
>> +        return ret;
>> +
>> +    src = s->frame;
>> +    ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0);
>> +    if (ret < 0)
>> +        return ret;
>> +
>> +    av_frame_move_ref(out, s->frame);
>> +    av_frame_move_ref(s->frame, s->tmp_frame);
>> +
>> +    ret = av_frame_copy_props(out, in);
>> +    if (ret < 0)
>> +        return ret;
>> +
>> +    return 0;
>> +}
>> +
>> +static int cudacrop_filter_frame(AVFilterLink *link, AVFrame *in)
>> +{
>> +    AVFilterContext              *ctx = link->dst;
>> +    CUDACropContext              *s = ctx->priv;
>> +    AVFilterLink             *outlink = ctx->outputs[0];
>> +    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
>> +
>> +    AVFrame *out = NULL;
>> +    CUcontext dummy;
>> +    int ret = 0;
>> +
>> +    out = av_frame_alloc();
>> +    if (!out) {
>> +        ret = AVERROR(ENOMEM);
>> +        goto fail;
>> +    }
>> +
>> +    ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
>> +    if (ret < 0)
>> +        goto fail;
>> +
>> +    ret = cudacrop_crop(ctx, out, in);
>> +
>> +    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
>> +    if (ret < 0)
>> +        goto fail;
>> +
>> +    av_reduce(&out->sample_aspect_ratio.num,
>> &out->sample_aspect_ratio.den,
>> +              (int64_t)in->sample_aspect_ratio.num * outlink->h * link->w,
>> +              (int64_t)in->sample_aspect_ratio.den * outlink->w * link->h,
>> +              INT_MAX);
>> +
>> +    av_frame_free(&in);
>> +    return ff_filter_frame(outlink, out);
>> +fail:
>> +    av_frame_free(&in);
>> +    av_frame_free(&out);
>> +    return ret;
>> +}
>> +
>> +#define OFFSET(x) offsetof(CUDACropContext, x)
>> +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM)
>> +static const AVOption options[] = {
>> +    { "w",      "set the width crop area expression",  OFFSET(w_expr),
>> AV_OPT_TYPE_STRING, { .str = "iw"   }, .flags = FLAGS },
>> +    { "h",      "set the height crop area expression", OFFSET(h_expr),
>> AV_OPT_TYPE_STRING, { .str = "ih"   }, .flags = FLAGS },
>> +    { "x",      "set the x crop area expression",      OFFSET(x_expr),
>> AV_OPT_TYPE_STRING, { .str = "(in_w-out_w)/2"}, .flags = FLAGS },
>> +    { "y",      "set the y crop area expression",      OFFSET(y_expr),
>> AV_OPT_TYPE_STRING, { .str = "(in_h-out_h)/2"}, .flags = FLAGS },
>> +    { NULL },
>> +};
>> +
>> +static const AVClass cudacrop_class = {
>> +    .class_name = "cudacrop",
>> +    .item_name  = av_default_item_name,
>> +    .option     = options,
>> +    .version    = LIBAVUTIL_VERSION_INT,
>> +};
>> +
>> +static const AVFilterPad cudacrop_inputs[] = {
>> +    {
>> +        .name        = "default",
>> +        .type        = AVMEDIA_TYPE_VIDEO,
>> +        .filter_frame = cudacrop_filter_frame,
>> +        .config_props = cudacrop_config_input,
>> +    },
>> +    { NULL }
>> +};
>> +
>> +static const AVFilterPad cudacrop_outputs[] = {
>> +    {
>> +        .name         = "default",
>> +        .type         = AVMEDIA_TYPE_VIDEO,
>> +        .config_props = cudacrop_config_output,
>> +    },
>> +    { NULL }
>> +};
>> +
>> +AVFilter ff_vf_crop_cuda = {
>> +    .name      = "crop_cuda",
>> +    .description = NULL_IF_CONFIG_SMALL("GPU accelerated video crop"),
>> +
>> +    .init          = cudacrop_init,
>> +    .uninit        = cudacrop_uninit,
>> +    .query_formats = cudacrop_query_formats,
>> +
>> +    .priv_size = sizeof(CUDACropContext),
>> +    .priv_class = &cudacrop_class,
>> +
>> +    .inputs    = cudacrop_inputs,
>> +    .outputs   = cudacrop_outputs,
>> +
>> +    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
>> +};
>> diff --git a/libavfilter/vf_crop_cuda.cu b/libavfilter/vf_crop_cuda.cu
>> new file mode 100644
>> index 0000000..4b94b73
>> --- /dev/null
>> +++ b/libavfilter/vf_crop_cuda.cu
>> @@ -0,0 +1,109 @@
>> +/*
>> + * Copyright (c) 2019, iQIYI CORPORATION. All rights reserved.
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a
>> + * copy of this software and associated documentation files (the
>> "Software"),
>> + * to deal in the Software without restriction, including without
>> limitation
>> + * the rights to use, copy, modify, merge, publish, distribute,
>> sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice shall be
>> included in
>> + * all copies or substantial portions of the Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>> EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>> MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
>> SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
>> OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
>> + * DEALINGS IN THE SOFTWARE.
>> + */
>> +
>> +extern "C" {
>> +
>> +__global__ void Crop_uchar(cudaTextureObject_t uchar_tex,
>> +                           unsigned char *dst,
>> +                           int dst_width, int dst_height, int dst_pitch,
>> +                           int left, int top)
>> +{
>> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
>> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
>> +    int xi = xo + left;
>> +    int yi = yo + top;
>> +
>> +    if (yo < dst_height && xo < dst_width)
>> +        dst[yo*dst_pitch+xo] = (unsigned char) tex2D<unsigned
>> char>(uchar_tex, xi, yi);
>> +}
>> +
>> +__global__ void Crop_uchar2(cudaTextureObject_t uchar2_tex,
>> +                            uchar2 *dst,
>> +                            int dst_width, int dst_height, int dst_pitch,
>> +                            int left, int top)
>> +{
>> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
>> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
>> +    int xi = xo + left;
>> +    int yi = yo + top;
>> +
>> +    if (yo < dst_height && xo < dst_width)
>> +        dst[yo*dst_pitch+xo] = (uchar2) tex2D<uchar2>(uchar2_tex, xi, yi);
>> +}
>> +
>> +__global__ void Crop_uchar4(cudaTextureObject_t uchar4_tex,
>> +                            uchar4 *dst,
>> +                            int dst_width, int dst_height, int dst_pitch,
>> +                            int left, int top)
>> +{
>> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
>> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
>> +    int xi = xo + left;
>> +    int yi = yo + top;
>> +
>> +    if (yo < dst_height && xo < dst_width)
>> +        dst[yo*dst_pitch+xo] = (uchar4) tex2D<uchar4>(uchar4_tex, xi, yi);
>> +}
>> +
>> +__global__ void Crop_ushort(cudaTextureObject_t ushort_tex,
>> +                            unsigned short *dst,
>> +                            int dst_width, int dst_height, int dst_pitch,
>> +                            int left, int top)
>> +{
>> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
>> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
>> +    int xi = xo + left;
>> +    int yi = yo + top;
>> +
>> +    if (yo < dst_height && xo < dst_width)
>> +        dst[yo*dst_pitch+xo] = (unsigned short) tex2D<unsigned
>> short>(ushort_tex, xi, yi);
>> +}
>> +
>> +__global__ void Crop_ushort2(cudaTextureObject_t ushort2_tex,
>> +                             ushort2 *dst,
>> +                             int dst_width, int dst_height, int dst_pitch,
>> +                             int left, int top)
>> +{
>> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
>> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
>> +    int xi = xo + left;
>> +    int yi = yo + top;
>> +
>> +    if (yo < dst_height && xo < dst_width)
>> +        dst[yo*dst_pitch+xo] = (ushort2) tex2D<ushort2>(ushort2_tex, xi,
>> yi);
>> +}
>> +
>> +__global__ void Crop_ushort4(cudaTextureObject_t ushort4_tex,
>> +                             ushort4 *dst,
>> +                             int dst_width, int dst_height, int dst_pitch,
>> +                             int left, int top)
>> +{
>> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
>> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
>> +    int xi = xo + left;
>> +    int yi = yo + top;
>> +
>> +    if (yo < dst_height && xo < dst_width)
>> +        dst[yo*dst_pitch+xo] = (ushort4) tex2D<ushort4>(ushort4_tex, xi,
>> yi);
>> +}
>> +
>> +}
>> --
>> 1.8.3.1
>> 
>> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

Thanks
Steven
leozhang March 24, 2019, 1:48 a.m. UTC | #3
Got it. Thanks Steven

Steven Liu <lq@chinaffmpeg.org> 于2019年3月24日周日 上午9:00写道:

>
>
> > 在 2019年3月24日,07:26,Tao Zhang <nowerzt@gmail.com> 写道:
> >
> > The corrected version. If there are no other comments or objections,
> could
> > this be pushed?
> Of course, maybe this need waiting for other reviewer, about 24 hours
> after maybe better than push it now.
> Because other reviewer maybe busy or not online weekend.
>
> >
> > UsingtcNower <nowerzt@gmail.com> 于2019年3月23日周六 下午11:51写道:
> >
> >> Signed-off-by: UsingtcNower <nowerzt@gmail.com>
> >> ---
> >> Changelog                   |   1 +
> >> configure                   |   1 +
> >> doc/filters.texi            |  31 +++
> >> libavfilter/Makefile        |   1 +
> >> libavfilter/allfilters.c    |   1 +
> >> libavfilter/version.h       |   2 +-
> >> libavfilter/vf_crop_cuda.c  | 638
> >> ++++++++++++++++++++++++++++++++++++++++++++
> >> libavfilter/vf_crop_cuda.cu | 109 ++++++++
> >> 8 files changed, 783 insertions(+), 1 deletion(-)
> >> create mode 100644 libavfilter/vf_crop_cuda.c
> >> create mode 100644 libavfilter/vf_crop_cuda.cu
> >>
> >> diff --git a/Changelog b/Changelog
> >> index ad7e82f..f224fc8 100644
> >> --- a/Changelog
> >> +++ b/Changelog
> >> @@ -20,6 +20,7 @@ version <next>:
> >> - libaribb24 based ARIB STD-B24 caption support (profiles A and C)
> >> - Support decoding of HEVC 4:4:4 content in nvdec and cuviddec
> >> - removed libndi-newtek
> >> +- crop_cuda GPU accelerated video crop filter
> >>
> >>
> >> version 4.1:
> >> diff --git a/configure b/configure
> >> index 331393f..3f3ac2f 100755
> >> --- a/configure
> >> +++ b/configure
> >> @@ -2973,6 +2973,7 @@ qsvvpp_select="qsv"
> >> vaapi_encode_deps="vaapi"
> >> v4l2_m2m_deps="linux_videodev2_h sem_timedwait"
> >>
> >> +crop_cuda_filter_deps="ffnvcodec cuda_nvcc"
> >> hwupload_cuda_filter_deps="ffnvcodec"
> >> scale_npp_filter_deps="ffnvcodec libnpp"
> >> scale_cuda_filter_deps="ffnvcodec cuda_nvcc"
> >> diff --git a/doc/filters.texi b/doc/filters.texi
> >> index 4ffb392..ee16a2d 100644
> >> --- a/doc/filters.texi
> >> +++ b/doc/filters.texi
> >> @@ -7415,6 +7415,37 @@ If the specified expression is not valid, it is
> >> kept at its current
> >> value.
> >> @end table
> >>
> >> +@section crop_cuda
> >> +
> >> +Crop the input video to given dimensions, implemented in CUDA.
> >> +
> >> +It accepts the following parameters:
> >> +
> >> +@table @option
> >> +
> >> +@item w
> >> +The width of the output video. It defaults to @code{iw}.
> >> +This expression is evaluated only once during the filter
> >> +configuration.
> >> +
> >> +@item h
> >> +The height of the output video. It defaults to @code{ih}.
> >> +This expression is evaluated only once during the filter
> >> +configuration.
> >> +
> >> +@item x
> >> +The horizontal position, in the input video, of the left edge of the
> >> output
> >> +video. It defaults to @code{(in_w-out_w)/2}.
> >> +This expression is evaluated only once during the filter
> >> +configuration.
> >> +
> >> +@item y
> >> +The vertical position, in the input video, of the top edge of the
> output
> >> video.
> >> +It defaults to @code{(in_h-out_h)/2}.
> >> +This expression is evaluated only once during the filter
> >> +configuration.
> >> +@end table
> >> +
> >> @section cropdetect
> >>
> >> Auto-detect the crop size.
> >> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> >> index fef6ec5..84df037 100644
> >> --- a/libavfilter/Makefile
> >> +++ b/libavfilter/Makefile
> >> @@ -187,6 +187,7 @@ OBJS-$(CONFIG_COPY_FILTER)                   +=
> >> vf_copy.o
> >> OBJS-$(CONFIG_COREIMAGE_FILTER)              += vf_coreimage.o
> >> OBJS-$(CONFIG_COVER_RECT_FILTER)             += vf_cover_rect.o
> >> lavfutils.o
> >> OBJS-$(CONFIG_CROP_FILTER)                   += vf_crop.o
> >> +OBJS-$(CONFIG_CROP_CUDA_FILTER)              += vf_crop_cuda.o
> >> vf_crop_cuda.ptx.o
> >> OBJS-$(CONFIG_CROPDETECT_FILTER)             += vf_cropdetect.o
> >> OBJS-$(CONFIG_CUE_FILTER)                    += f_cue.o
> >> OBJS-$(CONFIG_CURVES_FILTER)                 += vf_curves.o
> >> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> >> index c51ae0f..550e545 100644
> >> --- a/libavfilter/allfilters.c
> >> +++ b/libavfilter/allfilters.c
> >> @@ -175,6 +175,7 @@ extern AVFilter ff_vf_copy;
> >> extern AVFilter ff_vf_coreimage;
> >> extern AVFilter ff_vf_cover_rect;
> >> extern AVFilter ff_vf_crop;
> >> +extern AVFilter ff_vf_crop_cuda;
> >> extern AVFilter ff_vf_cropdetect;
> >> extern AVFilter ff_vf_cue;
> >> extern AVFilter ff_vf_curves;
> >> diff --git a/libavfilter/version.h b/libavfilter/version.h
> >> index c71282c..5aa95f4 100644
> >> --- a/libavfilter/version.h
> >> +++ b/libavfilter/version.h
> >> @@ -31,7 +31,7 @@
> >>
> >> #define LIBAVFILTER_VERSION_MAJOR   7
> >> #define LIBAVFILTER_VERSION_MINOR  48
> >> -#define LIBAVFILTER_VERSION_MICRO 100
> >> +#define LIBAVFILTER_VERSION_MICRO 101
> >>
> >> #define LIBAVFILTER_VERSION_INT
> AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR,
> >> \
> >>
> LIBAVFILTER_VERSION_MINOR,
> >> \
> >> diff --git a/libavfilter/vf_crop_cuda.c b/libavfilter/vf_crop_cuda.c
> >> new file mode 100644
> >> index 0000000..fc6a2a6
> >> --- /dev/null
> >> +++ b/libavfilter/vf_crop_cuda.c
> >> @@ -0,0 +1,638 @@
> >> +/*
> >> +* Copyright (c) 2019, iQIYI CORPORATION. All rights reserved.
> >> +*
> >> +* Permission is hereby granted, free of charge, to any person
> obtaining a
> >> +* copy of this software and associated documentation files (the
> >> "Software"),
> >> +* to deal in the Software without restriction, including without
> >> limitation
> >> +* the rights to use, copy, modify, merge, publish, distribute,
> sublicense,
> >> +* and/or sell copies of the Software, and to permit persons to whom the
> >> +* Software is furnished to do so, subject to the following conditions:
> >> +*
> >> +* The above copyright notice and this permission notice shall be
> included
> >> in
> >> +* all copies or substantial portions of the Software.
> >> +*
> >> +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS
> >> OR
> >> +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> >> +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
> SHALL
> >> +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
> >> OTHER
> >> +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> ARISING
> >> +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> >> +* DEALINGS IN THE SOFTWARE.
> >> +*/
> >> +
> >> +#include <stdio.h>
> >> +#include <string.h>
> >> +
> >> +#include "libavutil/avstring.h"
> >> +#include "libavutil/common.h"
> >> +#include "libavutil/hwcontext.h"
> >> +#include "libavutil/hwcontext_cuda_internal.h"
> >> +#include "libavutil/cuda_check.h"
> >> +#include "libavutil/internal.h"
> >> +#include "libavutil/opt.h"
> >> +#include "libavutil/pixdesc.h"
> >> +#include "libavutil/eval.h"
> >> +
> >> +#include "avfilter.h"
> >> +#include "formats.h"
> >> +#include "internal.h"
> >> +#include "video.h"
> >> +
> >> +static const char *const var_names[] = {
> >> +    "in_w", "iw",   ///< width  of the input video
> >> +    "in_h", "ih",   ///< height of the input video
> >> +    "out_w", "ow",  ///< width  of the cropped video
> >> +    "out_h", "oh",  ///< height of the cropped video
> >> +    "x",
> >> +    "y",
> >> +    NULL
> >> +};
> >> +
> >> +enum var_name {
> >> +    VAR_IN_W,  VAR_IW,
> >> +    VAR_IN_H,  VAR_IH,
> >> +    VAR_OUT_W, VAR_OW,
> >> +    VAR_OUT_H, VAR_OH,
> >> +    VAR_X,
> >> +    VAR_Y,
> >> +    VAR_VARS_NB
> >> +};
> >> +
> >> +static const enum AVPixelFormat supported_formats[] = {
> >> +    AV_PIX_FMT_YUV420P,
> >> +    AV_PIX_FMT_NV12,
> >> +    AV_PIX_FMT_YUV444P,
> >> +    AV_PIX_FMT_P010,
> >> +    AV_PIX_FMT_P016
> >> +};
> >> +
> >> +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
> >> +#define ALIGN_UP(a, b) (((a) + (b) - 1) & ~((b) - 1))
> >> +#define NUM_BUFFERS 2
> >> +#define BLOCKX 32
> >> +#define BLOCKY 16
> >> +
> >> +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl,
> x)
> >> +
> >> +typedef struct CUDACropContext {
> >> +    const AVClass *class;
> >> +    AVCUDADeviceContext *hwctx;
> >> +    enum AVPixelFormat in_fmt;
> >> +    enum AVPixelFormat out_fmt;
> >> +
> >> +    struct {
> >> +        int width;
> >> +        int height;
> >> +        int left;
> >> +        int top;
> >> +    } planes_in[3], planes_out[3];
> >> +
> >> +    AVBufferRef *frames_ctx;
> >> +    AVFrame     *frame;
> >> +
> >> +    AVFrame *tmp_frame;
> >> +    int passthrough;
> >> +
> >> +    /**
> >> +     * Output sw format. AV_PIX_FMT_NONE for no conversion.
> >> +     */
> >> +    enum AVPixelFormat format;
> >> +
> >> +    int w,h,x,y;
> >> +    char *w_expr, *h_expr, *x_expr, *y_expr;
> >> +    double var_values[VAR_VARS_NB];
> >> +
> >> +    CUcontext   cu_ctx;
> >> +    CUmodule    cu_module;
> >> +    CUfunction  cu_func_uchar;
> >> +    CUfunction  cu_func_uchar2;
> >> +    CUfunction  cu_func_uchar4;
> >> +    CUfunction  cu_func_ushort;
> >> +    CUfunction  cu_func_ushort2;
> >> +    CUfunction  cu_func_ushort4;
> >> +    CUstream    cu_stream;
> >> +
> >> +    CUdeviceptr srcBuffer;
> >> +    CUdeviceptr dstBuffer;
> >> +    int         tex_alignment;
> >> +} CUDACropContext;
> >> +
> >> +static av_cold int cudacrop_init(AVFilterContext *ctx)
> >> +{
> >> +    CUDACropContext *s = ctx->priv;
> >> +
> >> +    s->format = AV_PIX_FMT_NONE;
> >> +    s->frame = av_frame_alloc();
> >> +    if (!s->frame)
> >> +        return AVERROR(ENOMEM);
> >> +
> >> +    s->tmp_frame = av_frame_alloc();
> >> +    if (!s->tmp_frame)
> >> +        return AVERROR(ENOMEM);
> >> +
> >> +    return 0;
> >> +}
> >> +
> >> +static av_cold void cudacrop_uninit(AVFilterContext *ctx)
> >> +{
> >> +    CUDACropContext *s = ctx->priv;
> >> +
> >> +    av_frame_free(&s->frame);
> >> +    av_buffer_unref(&s->frames_ctx);
> >> +    av_frame_free(&s->tmp_frame);
> >> +}
> >> +
> >> +static int cudacrop_query_formats(AVFilterContext *ctx)
> >> +{
> >> +    static const enum AVPixelFormat pixel_formats[] = {
> >> +        AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE,
> >> +    };
> >> +    AVFilterFormats *pix_fmts = ff_make_format_list(pixel_formats);
> >> +
> >> +    return ff_set_common_formats(ctx, pix_fmts);
> >> +}
> >> +
> >> +static av_cold int init_stage(CUDACropContext *s, AVBufferRef
> *device_ctx)
> >> +{
> >> +    AVBufferRef *out_ref = NULL;
> >> +    AVHWFramesContext *out_ctx;
> >> +    int in_sw, in_sh, out_sw, out_sh;
> >> +    int ret, i;
> >> +
> >> +    av_pix_fmt_get_chroma_sub_sample(s->in_fmt,  &in_sw,  &in_sh);
> >> +    av_pix_fmt_get_chroma_sub_sample(s->out_fmt, &out_sw, &out_sh);
> >> +    if (!s->planes_out[0].width) {
> >> +        s->planes_out[0].width  = s->planes_in[0].width;
> >> +        s->planes_out[0].height = s->planes_in[0].height;
> >> +        s->planes_out[0].left = s->planes_in[0].left;
> >> +        s->planes_out[0].top  = s->planes_in[0].top;
> >> +    }
> >> +
> >> +    for (i = 1; i < FF_ARRAY_ELEMS(s->planes_in); i++) {
> >> +        s->planes_in[i].width   = s->planes_in[0].width   >> in_sw;
> >> +        s->planes_in[i].height  = s->planes_in[0].height  >> in_sh;
> >> +        s->planes_in[i].left    = s->planes_in[0].left    >> in_sw;
> >> +        s->planes_in[i].top     = s->planes_in[0].top     >> in_sh;
> >> +        s->planes_out[i].width  = s->planes_out[0].width  >> out_sw;
> >> +        s->planes_out[i].height = s->planes_out[0].height >> out_sh;
> >> +        s->planes_out[i].left   = 0;
> >> +        s->planes_out[i].top    = 0;
> >> +
> >> +    }
> >> +
> >> +    out_ref = av_hwframe_ctx_alloc(device_ctx);
> >> +    if (!out_ref)
> >> +        return AVERROR(ENOMEM);
> >> +    out_ctx = (AVHWFramesContext*)out_ref->data;
> >> +
> >> +    out_ctx->format    = AV_PIX_FMT_CUDA;
> >> +    out_ctx->sw_format = s->out_fmt;
> >> +    out_ctx->width     = FFALIGN(s->planes_out[0].width,  32);
> >> +    out_ctx->height    = FFALIGN(s->planes_out[0].height, 32);
> >> +
> >> +    ret = av_hwframe_ctx_init(out_ref);
> >> +    if (ret < 0)
> >> +        goto fail;
> >> +
> >> +    av_frame_unref(s->frame);
> >> +    ret = av_hwframe_get_buffer(out_ref, s->frame, 0);
> >> +    if (ret < 0)
> >> +        goto fail;
> >> +
> >> +    s->frame->width  = s->planes_out[0].width;
> >> +    s->frame->height = s->planes_out[0].height;
> >> +
> >> +    av_buffer_unref(&s->frames_ctx);
> >> +    s->frames_ctx = out_ref;
> >> +
> >> +    return 0;
> >> +fail:
> >> +    av_buffer_unref(&out_ref);
> >> +    return ret;
> >> +}
> >> +
> >> +static int format_is_supported(enum AVPixelFormat fmt)
> >> +{
> >> +    int i;
> >> +
> >> +    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
> >> +        if (supported_formats[i] == fmt)
> >> +            return 1;
> >> +    return 0;
> >> +}
> >> +
> >> +static av_cold int init_processing_chain(AVFilterContext *ctx, int
> >> in_width, int in_height,
> >> +                                         int out_width, int out_height,
> >> +                                         int left, int top)
> >> +{
> >> +    CUDACropContext *s = ctx->priv;
> >> +
> >> +    AVHWFramesContext *in_frames_ctx;
> >> +
> >> +    enum AVPixelFormat in_format;
> >> +    enum AVPixelFormat out_format;
> >> +    int ret;
> >> +
> >> +    /* check that we have a hw context */
> >> +    if (!ctx->inputs[0]->hw_frames_ctx) {
> >> +        av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n");
> >> +        return AVERROR(EINVAL);
> >> +    }
> >> +    in_frames_ctx =
> >> (AVHWFramesContext*)ctx->inputs[0]->hw_frames_ctx->data;
> >> +    in_format     = in_frames_ctx->sw_format;
> >> +    out_format    = (s->format == AV_PIX_FMT_NONE) ? in_format :
> >> s->format;
> >> +
> >> +    if (!format_is_supported(in_format)) {
> >> +        av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n",
> >> +               av_get_pix_fmt_name(in_format));
> >> +        return AVERROR(ENOSYS);
> >> +    }
> >> +    if (!format_is_supported(out_format)) {
> >> +        av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n",
> >> +               av_get_pix_fmt_name(out_format));
> >> +        return AVERROR(ENOSYS);
> >> +    }
> >> +
> >> +    if (in_width == out_width && in_height == out_height)
> >> +        s->passthrough = 1;
> >> +
> >> +    s->in_fmt = in_format;
> >> +    s->out_fmt = out_format;
> >> +
> >> +    s->planes_in[0].width   = in_width;
> >> +    s->planes_in[0].height  = in_height;
> >> +    s->planes_in[0].left = left;
> >> +    s->planes_in[0].top = top;
> >> +    s->planes_out[0].width  = out_width;
> >> +    s->planes_out[0].height = out_height;
> >> +    s->planes_out[0].left = 0;
> >> +    s->planes_out[0].top = 0;
> >> +
> >> +    ret = init_stage(s, in_frames_ctx->device_ref);
> >> +    if (ret < 0)
> >> +        return ret;
> >> +
> >> +    ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->frames_ctx);
> >> +    if (!ctx->outputs[0]->hw_frames_ctx)
> >> +        return AVERROR(ENOMEM);
> >> +
> >> +    return 0;
> >> +}
> >> +
> >> +static inline int normalize_double(int *n, double d)
> >> +{
> >> +    int ret = 0;
> >> +
> >> +    if (isnan(d))
> >> +        ret = AVERROR(EINVAL);
> >> +    else if (d > INT_MAX || d < INT_MIN) {
> >> +        *n = d > INT_MAX ? INT_MAX : INT_MIN;
> >> +        ret = AVERROR(EINVAL);
> >> +    } else
> >> +        *n = lrint(d);
> >> +
> >> +    return ret;
> >> +}
> >> +
> >> +static av_cold int cudacrop_config_input(AVFilterLink *inlink)
> >> +{
> >> +    AVFilterContext *ctx = inlink->dst;
> >> +    CUDACropContext *s = ctx->priv;
> >> +    double res;
> >> +    int ret;
> >> +
> >> +    s->var_values[VAR_IN_W] = s->var_values[VAR_IW] = inlink->w;
> >> +    s->var_values[VAR_IN_H] = s->var_values[VAR_IH] = inlink->h;
> >> +    s->var_values[VAR_OUT_W] = s->var_values[VAR_OW] = NAN;
> >> +    s->var_values[VAR_OUT_H] = s->var_values[VAR_OH] = NAN;
> >> +    s->var_values[VAR_X] = NAN;
> >> +    s->var_values[VAR_Y] = NAN;
> >> +    if ((ret = av_expr_parse_and_eval(&res, s->w_expr,
> >> +                                      var_names, s->var_values,
> >> +                                      NULL, NULL, NULL, NULL, NULL, 0,
> >> ctx)) < 0)
> >> +        goto fail;
> >> +    s->var_values[VAR_OUT_W] = s->var_values[VAR_OW] = res;
> >> +    if ((ret = av_expr_parse_and_eval(&res, s->h_expr,
> >> +                                      var_names, s->var_values,
> >> +                                      NULL, NULL, NULL, NULL, NULL, 0,
> >> ctx)) < 0)
> >> +        goto fail;
> >> +    s->var_values[VAR_OUT_H] = s->var_values[VAR_OH] = res;
> >> +    if ((ret = av_expr_parse_and_eval(&res, s->x_expr,
> >> +                                      var_names, s->var_values,
> >> +                                      NULL, NULL, NULL, NULL, NULL, 0,
> >> ctx)) < 0)
> >> +        goto fail;
> >> +    s->var_values[VAR_X] = res;
> >> +    if ((ret = av_expr_parse_and_eval(&res, s->y_expr,
> >> +                                      var_names, s->var_values,
> >> +                                      NULL, NULL, NULL, NULL, NULL, 0,
> >> ctx)) < 0)
> >> +        goto fail;
> >> +    s->var_values[VAR_Y] = res;
> >> +    if (normalize_double(&s->w, s->var_values[VAR_OW]) < 0 ||
> >> +        normalize_double(&s->h, s->var_values[VAR_OH]) < 0 ||
> >> +        normalize_double(&s->x, s->var_values[VAR_X]) < 0 ||
> >> +        normalize_double(&s->y, s->var_values[VAR_Y]) < 0) {
> >> +        av_log(ctx, AV_LOG_ERROR,
> >> +               "Too big value or invalid expression for out_w/ow or
> >> out_h/oh or x or y");
> >> +        return AVERROR(EINVAL);
> >> +    }
> >> +
> >> +fail:
> >> +    return ret;
> >> +}
> >> +
> >> +static av_cold int cudacrop_config_output(AVFilterLink *outlink)
> >> +{
> >> +    AVFilterContext *ctx = outlink->src;
> >> +    AVFilterLink *inlink = outlink->src->inputs[0];
> >> +    CUDACropContext *s  = ctx->priv;
> >> +    AVHWFramesContext     *frames_ctx =
> >> (AVHWFramesContext*)inlink->hw_frames_ctx->data;
> >> +    AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx;
> >> +    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
> >> +    CudaFunctions *cu = device_hwctx->internal->cuda_dl;
> >> +    int ret;
> >> +
> >> +    extern char vf_crop_cuda_ptx[];
> >> +
> >> +    s->hwctx = device_hwctx;
> >> +    s->cu_stream = s->hwctx->stream;
> >> +
> >> +    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
> >> +    if (ret < 0)
> >> +        goto fail;
> >> +
> >> +    ret = CHECK_CU(cu->cuModuleLoadData(&s->cu_module,
> vf_crop_cuda_ptx));
> >> +    if (ret < 0)
> >> +        goto fail;
> >> +
> >> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar, s->cu_module,
> >> "Crop_uchar"));
> >> +    if (ret < 0)
> >> +        goto fail;
> >> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module,
> >> "Crop_uchar2"));
> >> +    if (ret < 0)
> >> +        goto fail;
> >> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar4, s->cu_module,
> >> "Crop_uchar4"));
> >> +    if (ret < 0)
> >> +        goto fail;
> >> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort, s->cu_module,
> >> "Crop_ushort"));
> >> +    if (ret < 0)
> >> +        goto fail;
> >> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module,
> >> "Crop_ushort2"));
> >> +    if (ret < 0)
> >> +        goto fail;
> >> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort4, s->cu_module,
> >> "Crop_ushort4"));
> >> +    if (ret < 0)
> >> +        goto fail;
> >> +
> >> +    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
> >> +
> >> +    outlink->w = s->w;
> >> +    outlink->h = s->h;
> >> +
> >> +    ret = init_processing_chain(ctx, inlink->w, inlink->h, s->w, s->h,
> >> s->x, s->y);
> >> +    if (ret < 0)
> >> +        return ret;
> >> +
> >> +    if (inlink->sample_aspect_ratio.num) {
> >> +        outlink->sample_aspect_ratio =
> >> av_mul_q((AVRational){outlink->h*inlink->w,
> >> +
> >> outlink->w*inlink->h},
> >> +
> >> inlink->sample_aspect_ratio);
> >> +    } else {
> >> +        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
> >> +    }
> >> +
> >> +    return 0;
> >> +
> >> +fail:
> >> +    return ret;
> >> +}
> >> +
> >> +static int call_crop_kernel(AVFilterContext *ctx, CUfunction func, int
> >> channels,
> >> +                              uint8_t *src_dptr, int src_width, int
> >> src_height, int src_pitch,
> >> +                              uint8_t *dst_dptr, int dst_width, int
> >> dst_height, int dst_pitch,
> >> +                              int left, int top, int pixel_size)
> >> +{
> >> +    CUDACropContext *s = ctx->priv;
> >> +    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
> >> +    CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr;
> >> +    CUtexObject tex = 0;
> >> +    void *args_uchar[] = { &tex, &dst_devptr, &dst_width, &dst_height,
> >> &dst_pitch, &left, &top };
> >> +    int ret;
> >> +
> >> +    CUDA_TEXTURE_DESC tex_desc = {
> >> +        .filterMode = CU_TR_FILTER_MODE_LINEAR,
> >> +        .flags = CU_TRSF_READ_AS_INTEGER,
> >> +    };
> >> +
> >> +    CUDA_RESOURCE_DESC res_desc = {
> >> +        .resType = CU_RESOURCE_TYPE_PITCH2D,
> >> +        .res.pitch2D.format = pixel_size == 1 ?
> >> +                              CU_AD_FORMAT_UNSIGNED_INT8 :
> >> +                              CU_AD_FORMAT_UNSIGNED_INT16,
> >> +        .res.pitch2D.numChannels = channels,
> >> +        .res.pitch2D.width = src_width,
> >> +        .res.pitch2D.height = src_height,
> >> +        .res.pitch2D.pitchInBytes = src_pitch,
> >> +        .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
> >> +    };
> >> +
> >> +    ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc,
> >> NULL));
> >> +    if (ret < 0)
> >> +        goto exit;
> >> +
> >> +    ret = CHECK_CU(cu->cuLaunchKernel(func, DIV_UP(dst_width, BLOCKX),
> >> DIV_UP(dst_height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, s->cu_stream,
> >> args_uchar, NULL));
> >> +
> >> +exit:
> >> +    if (tex)
> >> +        CHECK_CU(cu->cuTexObjectDestroy(tex));
> >> +    return ret;
> >> +}
> >> +
> >> +static int cropcuda_crop_internal(AVFilterContext *ctx,
> >> +                            AVFrame *out, AVFrame *in)
> >> +{
> >> +    AVHWFramesContext *in_frames_ctx =
> >> (AVHWFramesContext*)in->hw_frames_ctx->data;
> >> +    CUDACropContext *s = ctx->priv;
> >> +
> >> +    switch (in_frames_ctx->sw_format) {
> >> +    case AV_PIX_FMT_YUV420P:
> >> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> >> +                           in->data[0], in->width, in->height,
> >> in->linesize[0],
> >> +                           out->data[0], out->width, out->height,
> >> out->linesize[0],
> >> +                           s->planes_in[0].left, s->planes_in[0].top,
> 1);
> >> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> >> +                           in->data[0]+in->linesize[0]*in->height,
> >> in->width/2, in->height/2, in->linesize[0]/2,
> >> +                           out->data[0]+out->linesize[0]*out->height,
> >> out->width/2, out->height/2, out->linesize[0]/2,
> >> +                           s->planes_in[1].left, s->planes_in[1].top,
> 1);
> >> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> >> +                           in->data[0]+
> >> ALIGN_UP((in->linesize[0]*in->height*5)/4, s->tex_alignment),
> in->width/2,
> >> in->height/2, in->linesize[0]/2,
> >> +
> >> out->data[0]+(out->linesize[0]*out->height*5)/4, out->width/2,
> >> out->height/2, out->linesize[0]/2,
> >> +                           s->planes_in[2].left, s->planes_in[2].top,
> 1);
> >> +        break;
> >> +    case AV_PIX_FMT_YUV444P:
> >> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> >> +                           in->data[0], in->width, in->height,
> >> in->linesize[0],
> >> +                           out->data[0], out->width, out->height,
> >> out->linesize[0],
> >> +                           s->planes_in[0].left, s->planes_in[0].top,
> 1);
> >> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> >> +                           in->data[0]+in->linesize[0]*in->height,
> >> in->width, in->height, in->linesize[0],
> >> +                           out->data[0]+out->linesize[0]*out->height,
> >> out->width, out->height, out->linesize[0],
> >> +                           s->planes_in[1].left, s->planes_in[1].top,
> 1);
> >> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> >> +                           in->data[0]+in->linesize[0]*in->height*2,
> >> in->width, in->height, in->linesize[0],
> >> +                           out->data[0]+out->linesize[0]*out->height*2,
> >> out->width, out->height, out->linesize[0],
> >> +                           s->planes_in[2].left, s->planes_in[2].top,
> 1);
> >> +        break;
> >> +    case AV_PIX_FMT_NV12:
> >> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> >> +                           in->data[0], in->width, in->height,
> >> in->linesize[0],
> >> +                           out->data[0], out->width, out->height,
> >> out->linesize[0],
> >> +                           s->planes_in[0].left, s->planes_in[0].top,
> 1);
> >> +        call_crop_kernel(ctx, s->cu_func_uchar2, 2,
> >> +                           in->data[1], in->width/2, in->height/2,
> >> in->linesize[1],
> >> +                           out->data[0] + out->linesize[0] *
> >> ((out->height + 31) & ~0x1f), out->width/2, out->height/2,
> >> out->linesize[1]/2,
> >> +                           s->planes_in[1].left, s->planes_in[1].top,
> 1);
> >> +        break;
> >> +    case AV_PIX_FMT_P010LE:
> >> +        call_crop_kernel(ctx, s->cu_func_ushort, 1,
> >> +                           in->data[0], in->width, in->height,
> >> in->linesize[0]/2,
> >> +                           out->data[0], out->width, out->height,
> >> out->linesize[0]/2,
> >> +                           s->planes_in[0].left, s->planes_in[0].top,
> 2);
> >> +        call_crop_kernel(ctx, s->cu_func_ushort2, 2,
> >> +                           in->data[1], in->width / 2, in->height / 2,
> >> in->linesize[1]/2,
> >> +                           out->data[0] + out->linesize[0] *
> >> ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2,
> >> out->linesize[1] / 4,
> >> +                           s->planes_in[1].left, s->planes_in[1].top,
> 2);
> >> +        break;
> >> +    case AV_PIX_FMT_P016LE:
> >> +        call_crop_kernel(ctx, s->cu_func_ushort, 1,
> >> +                           in->data[0], in->width, in->height,
> >> in->linesize[0] / 2,
> >> +                           out->data[0], out->width, out->height,
> >> out->linesize[0] / 2,
> >> +                           s->planes_in[0].left, s->planes_in[0].top,
> 2);
> >> +        call_crop_kernel(ctx, s->cu_func_ushort2, 2,
> >> +                           in->data[1], in->width / 2, in->height / 2,
> >> in->linesize[1] / 2,
> >> +                           out->data[0] + out->linesize[0] *
> >> ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2,
> >> out->linesize[1] / 4,
> >> +                           s->planes_in[1].left, s->planes_in[1].top,
> 2);
> >> +        break;
> >> +    default:
> >> +        return AVERROR_BUG;
> >> +    }
> >> +
> >> +    return 0;
> >> +}
> >> +
> >> +static int cudacrop_crop(AVFilterContext *ctx, AVFrame *out, AVFrame
> *in)
> >> +{
> >> +    CUDACropContext *s = ctx->priv;
> >> +    AVFrame *src = in;
> >> +    int ret;
> >> +
> >> +    ret = cropcuda_crop_internal(ctx, s->frame, src);
> >> +    if (ret < 0)
> >> +        return ret;
> >> +
> >> +    src = s->frame;
> >> +    ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0);
> >> +    if (ret < 0)
> >> +        return ret;
> >> +
> >> +    av_frame_move_ref(out, s->frame);
> >> +    av_frame_move_ref(s->frame, s->tmp_frame);
> >> +
> >> +    ret = av_frame_copy_props(out, in);
> >> +    if (ret < 0)
> >> +        return ret;
> >> +
> >> +    return 0;
> >> +}
> >> +
> >> +static int cudacrop_filter_frame(AVFilterLink *link, AVFrame *in)
> >> +{
> >> +    AVFilterContext              *ctx = link->dst;
> >> +    CUDACropContext              *s = ctx->priv;
> >> +    AVFilterLink             *outlink = ctx->outputs[0];
> >> +    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
> >> +
> >> +    AVFrame *out = NULL;
> >> +    CUcontext dummy;
> >> +    int ret = 0;
> >> +
> >> +    out = av_frame_alloc();
> >> +    if (!out) {
> >> +        ret = AVERROR(ENOMEM);
> >> +        goto fail;
> >> +    }
> >> +
> >> +    ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
> >> +    if (ret < 0)
> >> +        goto fail;
> >> +
> >> +    ret = cudacrop_crop(ctx, out, in);
> >> +
> >> +    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
> >> +    if (ret < 0)
> >> +        goto fail;
> >> +
> >> +    av_reduce(&out->sample_aspect_ratio.num,
> >> &out->sample_aspect_ratio.den,
> >> +              (int64_t)in->sample_aspect_ratio.num * outlink->h *
> link->w,
> >> +              (int64_t)in->sample_aspect_ratio.den * outlink->w *
> link->h,
> >> +              INT_MAX);
> >> +
> >> +    av_frame_free(&in);
> >> +    return ff_filter_frame(outlink, out);
> >> +fail:
> >> +    av_frame_free(&in);
> >> +    av_frame_free(&out);
> >> +    return ret;
> >> +}
> >> +
> >> +#define OFFSET(x) offsetof(CUDACropContext, x)
> >> +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM)
> >> +static const AVOption options[] = {
> >> +    { "w",      "set the width crop area expression",  OFFSET(w_expr),
> >> AV_OPT_TYPE_STRING, { .str = "iw"   }, .flags = FLAGS },
> >> +    { "h",      "set the height crop area expression", OFFSET(h_expr),
> >> AV_OPT_TYPE_STRING, { .str = "ih"   }, .flags = FLAGS },
> >> +    { "x",      "set the x crop area expression",      OFFSET(x_expr),
> >> AV_OPT_TYPE_STRING, { .str = "(in_w-out_w)/2"}, .flags = FLAGS },
> >> +    { "y",      "set the y crop area expression",      OFFSET(y_expr),
> >> AV_OPT_TYPE_STRING, { .str = "(in_h-out_h)/2"}, .flags = FLAGS },
> >> +    { NULL },
> >> +};
> >> +
> >> +static const AVClass cudacrop_class = {
> >> +    .class_name = "cudacrop",
> >> +    .item_name  = av_default_item_name,
> >> +    .option     = options,
> >> +    .version    = LIBAVUTIL_VERSION_INT,
> >> +};
> >> +
> >> +static const AVFilterPad cudacrop_inputs[] = {
> >> +    {
> >> +        .name        = "default",
> >> +        .type        = AVMEDIA_TYPE_VIDEO,
> >> +        .filter_frame = cudacrop_filter_frame,
> >> +        .config_props = cudacrop_config_input,
> >> +    },
> >> +    { NULL }
> >> +};
> >> +
> >> +static const AVFilterPad cudacrop_outputs[] = {
> >> +    {
> >> +        .name         = "default",
> >> +        .type         = AVMEDIA_TYPE_VIDEO,
> >> +        .config_props = cudacrop_config_output,
> >> +    },
> >> +    { NULL }
> >> +};
> >> +
> >> +AVFilter ff_vf_crop_cuda = {
> >> +    .name      = "crop_cuda",
> >> +    .description = NULL_IF_CONFIG_SMALL("GPU accelerated video crop"),
> >> +
> >> +    .init          = cudacrop_init,
> >> +    .uninit        = cudacrop_uninit,
> >> +    .query_formats = cudacrop_query_formats,
> >> +
> >> +    .priv_size = sizeof(CUDACropContext),
> >> +    .priv_class = &cudacrop_class,
> >> +
> >> +    .inputs    = cudacrop_inputs,
> >> +    .outputs   = cudacrop_outputs,
> >> +
> >> +    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
> >> +};
> >> diff --git a/libavfilter/vf_crop_cuda.cu b/libavfilter/vf_crop_cuda.cu
> >> new file mode 100644
> >> index 0000000..4b94b73
> >> --- /dev/null
> >> +++ b/libavfilter/vf_crop_cuda.cu
> >> @@ -0,0 +1,109 @@
> >> +/*
> >> + * Copyright (c) 2019, iQIYI CORPORATION. All rights reserved.
> >> + *
> >> + * Permission is hereby granted, free of charge, to any person
> obtaining a
> >> + * copy of this software and associated documentation files (the
> >> "Software"),
> >> + * to deal in the Software without restriction, including without
> >> limitation
> >> + * the rights to use, copy, modify, merge, publish, distribute,
> >> sublicense,
> >> + * and/or sell copies of the Software, and to permit persons to whom
> the
> >> + * Software is furnished to do so, subject to the following conditions:
> >> + *
> >> + * The above copyright notice and this permission notice shall be
> >> included in
> >> + * all copies or substantial portions of the Software.
> >> + *
> >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> >> EXPRESS OR
> >> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> >> MERCHANTABILITY,
> >> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
> >> SHALL
> >> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
> >> OTHER
> >> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> ARISING
> >> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> >> + * DEALINGS IN THE SOFTWARE.
> >> + */
> >> +
> >> +extern "C" {
> >> +
> >> +__global__ void Crop_uchar(cudaTextureObject_t uchar_tex,
> >> +                           unsigned char *dst,
> >> +                           int dst_width, int dst_height, int
> dst_pitch,
> >> +                           int left, int top)
> >> +{
> >> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> >> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> >> +    int xi = xo + left;
> >> +    int yi = yo + top;
> >> +
> >> +    if (yo < dst_height && xo < dst_width)
> >> +        dst[yo*dst_pitch+xo] = (unsigned char) tex2D<unsigned
> >> char>(uchar_tex, xi, yi);
> >> +}
> >> +
> >> +__global__ void Crop_uchar2(cudaTextureObject_t uchar2_tex,
> >> +                            uchar2 *dst,
> >> +                            int dst_width, int dst_height, int
> dst_pitch,
> >> +                            int left, int top)
> >> +{
> >> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> >> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> >> +    int xi = xo + left;
> >> +    int yi = yo + top;
> >> +
> >> +    if (yo < dst_height && xo < dst_width)
> >> +        dst[yo*dst_pitch+xo] = (uchar2) tex2D<uchar2>(uchar2_tex, xi,
> yi);
> >> +}
> >> +
> >> +__global__ void Crop_uchar4(cudaTextureObject_t uchar4_tex,
> >> +                            uchar4 *dst,
> >> +                            int dst_width, int dst_height, int
> dst_pitch,
> >> +                            int left, int top)
> >> +{
> >> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> >> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> >> +    int xi = xo + left;
> >> +    int yi = yo + top;
> >> +
> >> +    if (yo < dst_height && xo < dst_width)
> >> +        dst[yo*dst_pitch+xo] = (uchar4) tex2D<uchar4>(uchar4_tex, xi,
> yi);
> >> +}
> >> +
> >> +__global__ void Crop_ushort(cudaTextureObject_t ushort_tex,
> >> +                            unsigned short *dst,
> >> +                            int dst_width, int dst_height, int
> dst_pitch,
> >> +                            int left, int top)
> >> +{
> >> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> >> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> >> +    int xi = xo + left;
> >> +    int yi = yo + top;
> >> +
> >> +    if (yo < dst_height && xo < dst_width)
> >> +        dst[yo*dst_pitch+xo] = (unsigned short) tex2D<unsigned
> >> short>(ushort_tex, xi, yi);
> >> +}
> >> +
> >> +__global__ void Crop_ushort2(cudaTextureObject_t ushort2_tex,
> >> +                             ushort2 *dst,
> >> +                             int dst_width, int dst_height, int
> dst_pitch,
> >> +                             int left, int top)
> >> +{
> >> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> >> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> >> +    int xi = xo + left;
> >> +    int yi = yo + top;
> >> +
> >> +    if (yo < dst_height && xo < dst_width)
> >> +        dst[yo*dst_pitch+xo] = (ushort2) tex2D<ushort2>(ushort2_tex,
> xi,
> >> yi);
> >> +}
> >> +
> >> +__global__ void Crop_ushort4(cudaTextureObject_t ushort4_tex,
> >> +                             ushort4 *dst,
> >> +                             int dst_width, int dst_height, int
> dst_pitch,
> >> +                             int left, int top)
> >> +{
> >> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> >> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> >> +    int xi = xo + left;
> >> +    int yi = yo + top;
> >> +
> >> +    if (yo < dst_height && xo < dst_width)
> >> +        dst[yo*dst_pitch+xo] = (ushort4) tex2D<ushort4>(ushort4_tex,
> xi,
> >> yi);
> >> +}
> >> +
> >> +}
> >> --
> >> 1.8.3.1
> >>
> >>
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> > To unsubscribe, visit link above, or email
> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
> Thanks
> Steven
>
>
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Ruiling Song March 24, 2019, 1:28 p.m. UTC | #4
> -----Original Message-----

> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of

> UsingtcNower

> Sent: Saturday, March 23, 2019 11:51 PM

> To: ffmpeg-devel@ffmpeg.org

> Subject: [FFmpeg-devel] [PATCH][FFmpeg-devel v2] Add GPU accelerated video

> crop filter

> 

> Signed-off-by: UsingtcNower <nowerzt@gmail.com>

> ---

>  Changelog                   |   1 +

>  configure                   |   1 +

>  doc/filters.texi            |  31 +++

>  libavfilter/Makefile        |   1 +

>  libavfilter/allfilters.c    |   1 +

>  libavfilter/version.h       |   2 +-

>  libavfilter/vf_crop_cuda.c  | 638

> ++++++++++++++++++++++++++++++++++++++++++++

>  libavfilter/vf_crop_cuda.cu | 109 ++++++++

>  8 files changed, 783 insertions(+), 1 deletion(-)

>  create mode 100644 libavfilter/vf_crop_cuda.c

>  create mode 100644 libavfilter/vf_crop_cuda.cu

> 

> diff --git a/Changelog b/Changelog

> index ad7e82f..f224fc8 100644

> --- a/Changelog

> +++ b/Changelog

> @@ -20,6 +20,7 @@ version <next>:

>  - libaribb24 based ARIB STD-B24 caption support (profiles A and C)

>  - Support decoding of HEVC 4:4:4 content in nvdec and cuviddec

>  - removed libndi-newtek

> +- crop_cuda GPU accelerated video crop filter

> 

> 

>  version 4.1:

> diff --git a/configure b/configure

> index 331393f..3f3ac2f 100755

> --- a/configure

> +++ b/configure

> @@ -2973,6 +2973,7 @@ qsvvpp_select="qsv"

>  vaapi_encode_deps="vaapi"

>  v4l2_m2m_deps="linux_videodev2_h sem_timedwait"

> 

> +crop_cuda_filter_deps="ffnvcodec cuda_nvcc"

Seems you are using NAN, you may also need to check dependency against "const_nan"

>  hwupload_cuda_filter_deps="ffnvcodec"

>  scale_npp_filter_deps="ffnvcodec libnpp"

>  scale_cuda_filter_deps="ffnvcodec cuda_nvcc"

> diff --git a/doc/filters.texi b/doc/filters.texi

> index 4ffb392..ee16a2d 100644

> --- a/doc/filters.texi

> +++ b/doc/filters.texi

> @@ -7415,6 +7415,37 @@ If the specified expression is not valid, it is kept at its

> current

>  value.

>  @end table

> 

> +@section crop_cuda

> +

> +Crop the input video to given dimensions, implemented in CUDA.

> +

> +It accepts the following parameters:

> +

> +@table @option

> +

> +@item w

> +The width of the output video. It defaults to @code{iw}.

> +This expression is evaluated only once during the filter

> +configuration.

> +

> +@item h

> +The height of the output video. It defaults to @code{ih}.

> +This expression is evaluated only once during the filter

> +configuration.

> +

> +@item x

> +The horizontal position, in the input video, of the left edge of the output

> +video. It defaults to @code{(in_w-out_w)/2}.

> +This expression is evaluated only once during the filter

> +configuration.

> +

> +@item y

> +The vertical position, in the input video, of the top edge of the output video.

> +It defaults to @code{(in_h-out_h)/2}.

> +This expression is evaluated only once during the filter

> +configuration.

> +@end table

> +

>  @section cropdetect

> 

>  Auto-detect the crop size.

> diff --git a/libavfilter/Makefile b/libavfilter/Makefile

> index fef6ec5..84df037 100644

> --- a/libavfilter/Makefile

> +++ b/libavfilter/Makefile

> @@ -187,6 +187,7 @@ OBJS-$(CONFIG_COPY_FILTER)                   += vf_copy.o

>  OBJS-$(CONFIG_COREIMAGE_FILTER)              += vf_coreimage.o

>  OBJS-$(CONFIG_COVER_RECT_FILTER)             += vf_cover_rect.o lavfutils.o

>  OBJS-$(CONFIG_CROP_FILTER)                   += vf_crop.o

> +OBJS-$(CONFIG_CROP_CUDA_FILTER)              += vf_crop_cuda.o

> vf_crop_cuda.ptx.o

>  OBJS-$(CONFIG_CROPDETECT_FILTER)             += vf_cropdetect.o

>  OBJS-$(CONFIG_CUE_FILTER)                    += f_cue.o

>  OBJS-$(CONFIG_CURVES_FILTER)                 += vf_curves.o

> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c

> index c51ae0f..550e545 100644

> --- a/libavfilter/allfilters.c

> +++ b/libavfilter/allfilters.c

> @@ -175,6 +175,7 @@ extern AVFilter ff_vf_copy;

>  extern AVFilter ff_vf_coreimage;

>  extern AVFilter ff_vf_cover_rect;

>  extern AVFilter ff_vf_crop;

> +extern AVFilter ff_vf_crop_cuda;

>  extern AVFilter ff_vf_cropdetect;

>  extern AVFilter ff_vf_cue;

>  extern AVFilter ff_vf_curves;

> diff --git a/libavfilter/version.h b/libavfilter/version.h

> index c71282c..5aa95f4 100644

> --- a/libavfilter/version.h

> +++ b/libavfilter/version.h

> @@ -31,7 +31,7 @@

> 

>  #define LIBAVFILTER_VERSION_MAJOR   7

>  #define LIBAVFILTER_VERSION_MINOR  48

> -#define LIBAVFILTER_VERSION_MICRO 100

> +#define LIBAVFILTER_VERSION_MICRO 101

> 

>  #define LIBAVFILTER_VERSION_INT

> AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \

>                                                 LIBAVFILTER_VERSION_MINOR, \

> diff --git a/libavfilter/vf_crop_cuda.c b/libavfilter/vf_crop_cuda.c

> new file mode 100644

> index 0000000..fc6a2a6

> --- /dev/null

> +++ b/libavfilter/vf_crop_cuda.c

> @@ -0,0 +1,638 @@

> +/*

> +* Copyright (c) 2019, iQIYI CORPORATION. All rights reserved.

> +*

> +* Permission is hereby granted, free of charge, to any person obtaining a

> +* copy of this software and associated documentation files (the "Software"),

> +* to deal in the Software without restriction, including without limitation

> +* the rights to use, copy, modify, merge, publish, distribute, sublicense,

> +* and/or sell copies of the Software, and to permit persons to whom the

> +* Software is furnished to do so, subject to the following conditions:

> +*

> +* The above copyright notice and this permission notice shall be included in

> +* all copies or substantial portions of the Software.

> +*

> +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,

> EXPRESS OR

> +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF

> MERCHANTABILITY,

> +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO

> EVENT SHALL

> +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,

> DAMAGES OR OTHER

> +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,

> ARISING

> +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR

> OTHER

> +* DEALINGS IN THE SOFTWARE.

> +*/

> +

> +#include <stdio.h>

> +#include <string.h>

> +

> +#include "libavutil/avstring.h"

> +#include "libavutil/common.h"

> +#include "libavutil/hwcontext.h"

> +#include "libavutil/hwcontext_cuda_internal.h"

> +#include "libavutil/cuda_check.h"

> +#include "libavutil/internal.h"

> +#include "libavutil/opt.h"

> +#include "libavutil/pixdesc.h"

> +#include "libavutil/eval.h"

> +

> +#include "avfilter.h"

> +#include "formats.h"

> +#include "internal.h"

> +#include "video.h"

> +

> +static const char *const var_names[] = {

> +    "in_w", "iw",   ///< width  of the input video

> +    "in_h", "ih",   ///< height of the input video

> +    "out_w", "ow",  ///< width  of the cropped video

> +    "out_h", "oh",  ///< height of the cropped video

> +    "x",

> +    "y",

> +    NULL

> +};

> +

> +enum var_name {

> +    VAR_IN_W,  VAR_IW,

> +    VAR_IN_H,  VAR_IH,

> +    VAR_OUT_W, VAR_OW,

> +    VAR_OUT_H, VAR_OH,

> +    VAR_X,

> +    VAR_Y,

> +    VAR_VARS_NB

> +};

> +

> +static const enum AVPixelFormat supported_formats[] = {

> +    AV_PIX_FMT_YUV420P,

> +    AV_PIX_FMT_NV12,

> +    AV_PIX_FMT_YUV444P,

> +    AV_PIX_FMT_P010,

> +    AV_PIX_FMT_P016

> +};

> +

> +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )

> +#define ALIGN_UP(a, b) (((a) + (b) - 1) & ~((b) - 1))

FFALIGN() should do the same work.

> +#define NUM_BUFFERS 2

> +#define BLOCKX 32

> +#define BLOCKY 16

> +

> +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)

> +

> +typedef struct CUDACropContext {

> +    const AVClass *class;

> +    AVCUDADeviceContext *hwctx;

> +    enum AVPixelFormat in_fmt;

> +    enum AVPixelFormat out_fmt;

> +

> +    struct {

> +        int width;

> +        int height;

> +        int left;

> +        int top;

> +    } planes_in[3], planes_out[3];

> +

> +    AVBufferRef *frames_ctx;

> +    AVFrame     *frame;

> +

> +    AVFrame *tmp_frame;

> +    int passthrough;

> +

> +    /**

> +     * Output sw format. AV_PIX_FMT_NONE for no conversion.

> +     */

> +    enum AVPixelFormat format;

> +

> +    int w,h,x,y;

> +    char *w_expr, *h_expr, *x_expr, *y_expr;

> +    double var_values[VAR_VARS_NB];

> +

> +    CUcontext   cu_ctx;

> +    CUmodule    cu_module;

> +    CUfunction  cu_func_uchar;

> +    CUfunction  cu_func_uchar2;

> +    CUfunction  cu_func_uchar4;

> +    CUfunction  cu_func_ushort;

> +    CUfunction  cu_func_ushort2;

> +    CUfunction  cu_func_ushort4;

> +    CUstream    cu_stream;

> +

> +    CUdeviceptr srcBuffer;

> +    CUdeviceptr dstBuffer;

> +    int         tex_alignment;

> +} CUDACropContext;

> +

> +static av_cold int cudacrop_init(AVFilterContext *ctx)

> +{

> +    CUDACropContext *s = ctx->priv;

> +

> +    s->format = AV_PIX_FMT_NONE;

> +    s->frame = av_frame_alloc();

> +    if (!s->frame)

> +        return AVERROR(ENOMEM);

> +

> +    s->tmp_frame = av_frame_alloc();

> +    if (!s->tmp_frame)

> +        return AVERROR(ENOMEM);

> +

> +    return 0;

> +}

> +

> +static av_cold void cudacrop_uninit(AVFilterContext *ctx)

> +{

> +    CUDACropContext *s = ctx->priv;

> +

> +    av_frame_free(&s->frame);

> +    av_buffer_unref(&s->frames_ctx);

> +    av_frame_free(&s->tmp_frame);

> +}

> +

> +static int cudacrop_query_formats(AVFilterContext *ctx)

> +{

> +    static const enum AVPixelFormat pixel_formats[] = {

> +        AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE,

> +    };

> +    AVFilterFormats *pix_fmts = ff_make_format_list(pixel_formats);

> +

> +    return ff_set_common_formats(ctx, pix_fmts);

> +}

> +

> +static av_cold int init_stage(CUDACropContext *s, AVBufferRef *device_ctx)

> +{

> +    AVBufferRef *out_ref = NULL;

> +    AVHWFramesContext *out_ctx;

> +    int in_sw, in_sh, out_sw, out_sh;

> +    int ret, i;

> +

> +    av_pix_fmt_get_chroma_sub_sample(s->in_fmt,  &in_sw,  &in_sh);

> +    av_pix_fmt_get_chroma_sub_sample(s->out_fmt, &out_sw, &out_sh);

> +    if (!s->planes_out[0].width) {

> +        s->planes_out[0].width  = s->planes_in[0].width;

> +        s->planes_out[0].height = s->planes_in[0].height;

> +        s->planes_out[0].left = s->planes_in[0].left;

> +        s->planes_out[0].top  = s->planes_in[0].top;

> +    }

> +

> +    for (i = 1; i < FF_ARRAY_ELEMS(s->planes_in); i++) {

> +        s->planes_in[i].width   = s->planes_in[0].width   >> in_sw;

> +        s->planes_in[i].height  = s->planes_in[0].height  >> in_sh;

> +        s->planes_in[i].left    = s->planes_in[0].left    >> in_sw;

> +        s->planes_in[i].top     = s->planes_in[0].top     >> in_sh;

> +        s->planes_out[i].width  = s->planes_out[0].width  >> out_sw;

> +        s->planes_out[i].height = s->planes_out[0].height >> out_sh;

> +        s->planes_out[i].left   = 0;

> +        s->planes_out[i].top    = 0;

> +

> +    }

> +

> +    out_ref = av_hwframe_ctx_alloc(device_ctx);

> +    if (!out_ref)

> +        return AVERROR(ENOMEM);

> +    out_ctx = (AVHWFramesContext*)out_ref->data;

> +

> +    out_ctx->format    = AV_PIX_FMT_CUDA;

> +    out_ctx->sw_format = s->out_fmt;

> +    out_ctx->width     = FFALIGN(s->planes_out[0].width,  32);

> +    out_ctx->height    = FFALIGN(s->planes_out[0].height, 32);

> +

> +    ret = av_hwframe_ctx_init(out_ref);

> +    if (ret < 0)

> +        goto fail;

> +

> +    av_frame_unref(s->frame);

> +    ret = av_hwframe_get_buffer(out_ref, s->frame, 0);

> +    if (ret < 0)

> +        goto fail;

> +

> +    s->frame->width  = s->planes_out[0].width;

> +    s->frame->height = s->planes_out[0].height;

> +

> +    av_buffer_unref(&s->frames_ctx);

> +    s->frames_ctx = out_ref;

> +

> +    return 0;

> +fail:

> +    av_buffer_unref(&out_ref);

> +    return ret;

> +}

> +

> +static int format_is_supported(enum AVPixelFormat fmt)

> +{

> +    int i;

> +

> +    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)

> +        if (supported_formats[i] == fmt)

> +            return 1;

> +    return 0;

> +}

> +

> +static av_cold int init_processing_chain(AVFilterContext *ctx, int in_width, int

> in_height,

> +                                         int out_width, int out_height,

> +                                         int left, int top)

> +{

> +    CUDACropContext *s = ctx->priv;

> +

> +    AVHWFramesContext *in_frames_ctx;

> +

> +    enum AVPixelFormat in_format;

> +    enum AVPixelFormat out_format;

> +    int ret;

> +

> +    /* check that we have a hw context */

> +    if (!ctx->inputs[0]->hw_frames_ctx) {

> +        av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n");

> +        return AVERROR(EINVAL);

> +    }

> +    in_frames_ctx = (AVHWFramesContext*)ctx->inputs[0]->hw_frames_ctx-

> >data;

> +    in_format     = in_frames_ctx->sw_format;

> +    out_format    = (s->format == AV_PIX_FMT_NONE) ? in_format : s->format;

> +

> +    if (!format_is_supported(in_format)) {

> +        av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n",

> +               av_get_pix_fmt_name(in_format));

> +        return AVERROR(ENOSYS);

> +    }

> +    if (!format_is_supported(out_format)) {

> +        av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n",

> +               av_get_pix_fmt_name(out_format));

> +        return AVERROR(ENOSYS);

> +    }

> +

> +    if (in_width == out_width && in_height == out_height)

> +        s->passthrough = 1;

You check for the passthrough, but seems not implement passthrough logic?
I think you can either implement passthrough logic, or totally remove the passthrough code.
I am not sure the passthrough code in transpose_opencl could give you some help.
I am not familiar with cuda, so I don't have comment on cuda code.

Thanks!
Ruiling
> +

> +    s->in_fmt = in_format;

> +    s->out_fmt = out_format;

> +

> +    s->planes_in[0].width   = in_width;

> +    s->planes_in[0].height  = in_height;

> +    s->planes_in[0].left = left;

> +    s->planes_in[0].top = top;

> +    s->planes_out[0].width  = out_width;

> +    s->planes_out[0].height = out_height;

> +    s->planes_out[0].left = 0;

> +    s->planes_out[0].top = 0;

> +

> +    ret = init_stage(s, in_frames_ctx->device_ref);

> +    if (ret < 0)

> +        return ret;

> +

> +    ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->frames_ctx);

> +    if (!ctx->outputs[0]->hw_frames_ctx)

> +        return AVERROR(ENOMEM);

> +

> +    return 0;

> +}

> +

> +static inline int normalize_double(int *n, double d)

> +{

> +    int ret = 0;

> +

> +    if (isnan(d))

> +        ret = AVERROR(EINVAL);

> +    else if (d > INT_MAX || d < INT_MIN) {

> +        *n = d > INT_MAX ? INT_MAX : INT_MIN;

> +        ret = AVERROR(EINVAL);

> +    } else

> +        *n = lrint(d);

> +

> +    return ret;

> +}

> +

> +static av_cold int cudacrop_config_input(AVFilterLink *inlink)

> +{

> +    AVFilterContext *ctx = inlink->dst;

> +    CUDACropContext *s = ctx->priv;

> +    double res;

> +    int ret;

> +

> +    s->var_values[VAR_IN_W] = s->var_values[VAR_IW] = inlink->w;

> +    s->var_values[VAR_IN_H] = s->var_values[VAR_IH] = inlink->h;

> +    s->var_values[VAR_OUT_W] = s->var_values[VAR_OW] = NAN;

> +    s->var_values[VAR_OUT_H] = s->var_values[VAR_OH] = NAN;

> +    s->var_values[VAR_X] = NAN;

> +    s->var_values[VAR_Y] = NAN;

> +    if ((ret = av_expr_parse_and_eval(&res, s->w_expr,

> +                                      var_names, s->var_values,

> +                                      NULL, NULL, NULL, NULL, NULL, 0, ctx)) < 0)

> +        goto fail;

> +    s->var_values[VAR_OUT_W] = s->var_values[VAR_OW] = res;

> +    if ((ret = av_expr_parse_and_eval(&res, s->h_expr,

> +                                      var_names, s->var_values,

> +                                      NULL, NULL, NULL, NULL, NULL, 0, ctx)) < 0)

> +        goto fail;

> +    s->var_values[VAR_OUT_H] = s->var_values[VAR_OH] = res;

> +    if ((ret = av_expr_parse_and_eval(&res, s->x_expr,

> +                                      var_names, s->var_values,

> +                                      NULL, NULL, NULL, NULL, NULL, 0, ctx)) < 0)

> +        goto fail;

> +    s->var_values[VAR_X] = res;

> +    if ((ret = av_expr_parse_and_eval(&res, s->y_expr,

> +                                      var_names, s->var_values,

> +                                      NULL, NULL, NULL, NULL, NULL, 0, ctx)) < 0)

> +        goto fail;

> +    s->var_values[VAR_Y] = res;

> +    if (normalize_double(&s->w, s->var_values[VAR_OW]) < 0 ||

> +        normalize_double(&s->h, s->var_values[VAR_OH]) < 0 ||

> +        normalize_double(&s->x, s->var_values[VAR_X]) < 0 ||

> +        normalize_double(&s->y, s->var_values[VAR_Y]) < 0) {

> +        av_log(ctx, AV_LOG_ERROR,

> +               "Too big value or invalid expression for out_w/ow or out_h/oh or x or

> y");

> +        return AVERROR(EINVAL);

> +    }

> +

> +fail:

> +    return ret;

> +}

> +

> +static av_cold int cudacrop_config_output(AVFilterLink *outlink)

> +{

> +    AVFilterContext *ctx = outlink->src;

> +    AVFilterLink *inlink = outlink->src->inputs[0];

> +    CUDACropContext *s  = ctx->priv;

> +    AVHWFramesContext     *frames_ctx = (AVHWFramesContext*)inlink-

> >hw_frames_ctx->data;

> +    AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx;

> +    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;

> +    CudaFunctions *cu = device_hwctx->internal->cuda_dl;

> +    int ret;

> +

> +    extern char vf_crop_cuda_ptx[];

> +

> +    s->hwctx = device_hwctx;

> +    s->cu_stream = s->hwctx->stream;

> +

> +    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));

> +    if (ret < 0)

> +        goto fail;

> +

> +    ret = CHECK_CU(cu->cuModuleLoadData(&s->cu_module,

> vf_crop_cuda_ptx));

> +    if (ret < 0)

> +        goto fail;

> +

> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar, s->cu_module,

> "Crop_uchar"));

> +    if (ret < 0)

> +        goto fail;

> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module,

> "Crop_uchar2"));

> +    if (ret < 0)

> +        goto fail;

> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar4, s->cu_module,

> "Crop_uchar4"));

> +    if (ret < 0)

> +        goto fail;

> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort, s->cu_module,

> "Crop_ushort"));

> +    if (ret < 0)

> +        goto fail;

> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module,

> "Crop_ushort2"));

> +    if (ret < 0)

> +        goto fail;

> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort4, s->cu_module,

> "Crop_ushort4"));

> +    if (ret < 0)

> +        goto fail;

> +

> +    CHECK_CU(cu->cuCtxPopCurrent(&dummy));

> +

> +    outlink->w = s->w;

> +    outlink->h = s->h;

> +

> +    ret = init_processing_chain(ctx, inlink->w, inlink->h, s->w, s->h, s->x, s->y);

> +    if (ret < 0)

> +        return ret;

> +

> +    if (inlink->sample_aspect_ratio.num) {

> +        outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h*inlink-

> >w,

> +                                                             outlink->w*inlink->h},

> +                                                inlink->sample_aspect_ratio);

> +    } else {

> +        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;

> +    }

> +

> +    return 0;

> +

> +fail:

> +    return ret;

> +}

> +

> +static int call_crop_kernel(AVFilterContext *ctx, CUfunction func, int channels,

> +                              uint8_t *src_dptr, int src_width, int src_height, int src_pitch,

> +                              uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch,

> +                              int left, int top, int pixel_size)

> +{

> +    CUDACropContext *s = ctx->priv;

> +    CudaFunctions *cu = s->hwctx->internal->cuda_dl;

> +    CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr;

> +    CUtexObject tex = 0;

> +    void *args_uchar[] = { &tex, &dst_devptr, &dst_width, &dst_height,

> &dst_pitch, &left, &top };

> +    int ret;

> +

> +    CUDA_TEXTURE_DESC tex_desc = {

> +        .filterMode = CU_TR_FILTER_MODE_LINEAR,

> +        .flags = CU_TRSF_READ_AS_INTEGER,

> +    };

> +

> +    CUDA_RESOURCE_DESC res_desc = {

> +        .resType = CU_RESOURCE_TYPE_PITCH2D,

> +        .res.pitch2D.format = pixel_size == 1 ?

> +                              CU_AD_FORMAT_UNSIGNED_INT8 :

> +                              CU_AD_FORMAT_UNSIGNED_INT16,

> +        .res.pitch2D.numChannels = channels,

> +        .res.pitch2D.width = src_width,

> +        .res.pitch2D.height = src_height,

> +        .res.pitch2D.pitchInBytes = src_pitch,

> +        .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,

> +    };

> +

> +    ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc,

> NULL));

> +    if (ret < 0)

> +        goto exit;

> +

> +    ret = CHECK_CU(cu->cuLaunchKernel(func, DIV_UP(dst_width, BLOCKX),

> DIV_UP(dst_height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, s->cu_stream,

> args_uchar, NULL));

> +

> +exit:

> +    if (tex)

> +        CHECK_CU(cu->cuTexObjectDestroy(tex));

> +    return ret;

> +}

> +

> +static int cropcuda_crop_internal(AVFilterContext *ctx,

> +                            AVFrame *out, AVFrame *in)

> +{

> +    AVHWFramesContext *in_frames_ctx = (AVHWFramesContext*)in-

> >hw_frames_ctx->data;

> +    CUDACropContext *s = ctx->priv;

> +

> +    switch (in_frames_ctx->sw_format) {

> +    case AV_PIX_FMT_YUV420P:

> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,

> +                           in->data[0], in->width, in->height, in->linesize[0],

> +                           out->data[0], out->width, out->height, out->linesize[0],

> +                           s->planes_in[0].left, s->planes_in[0].top, 1);

> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,

> +                           in->data[0]+in->linesize[0]*in->height, in->width/2, in->height/2,

> in->linesize[0]/2,

> +                           out->data[0]+out->linesize[0]*out->height, out->width/2, out-

> >height/2, out->linesize[0]/2,

> +                           s->planes_in[1].left, s->planes_in[1].top, 1);

> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,

> +                           in->data[0]+ ALIGN_UP((in->linesize[0]*in->height*5)/4, s-

> >tex_alignment), in->width/2, in->height/2, in->linesize[0]/2,

> +                           out->data[0]+(out->linesize[0]*out->height*5)/4, out->width/2,

> out->height/2, out->linesize[0]/2,

> +                           s->planes_in[2].left, s->planes_in[2].top, 1);

> +        break;

> +    case AV_PIX_FMT_YUV444P:

> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,

> +                           in->data[0], in->width, in->height, in->linesize[0],

> +                           out->data[0], out->width, out->height, out->linesize[0],

> +                           s->planes_in[0].left, s->planes_in[0].top, 1);

> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,

> +                           in->data[0]+in->linesize[0]*in->height, in->width, in->height, in-

> >linesize[0],

> +                           out->data[0]+out->linesize[0]*out->height, out->width, out-

> >height, out->linesize[0],

> +                           s->planes_in[1].left, s->planes_in[1].top, 1);

> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,

> +                           in->data[0]+in->linesize[0]*in->height*2, in->width, in->height,

> in->linesize[0],

> +                           out->data[0]+out->linesize[0]*out->height*2, out->width, out-

> >height, out->linesize[0],

> +                           s->planes_in[2].left, s->planes_in[2].top, 1);

> +        break;

> +    case AV_PIX_FMT_NV12:

> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,

> +                           in->data[0], in->width, in->height, in->linesize[0],

> +                           out->data[0], out->width, out->height, out->linesize[0],

> +                           s->planes_in[0].left, s->planes_in[0].top, 1);

> +        call_crop_kernel(ctx, s->cu_func_uchar2, 2,

> +                           in->data[1], in->width/2, in->height/2, in->linesize[1],

> +                           out->data[0] + out->linesize[0] * ((out->height + 31) & ~0x1f),

> out->width/2, out->height/2, out->linesize[1]/2,

> +                           s->planes_in[1].left, s->planes_in[1].top, 1);

> +        break;

> +    case AV_PIX_FMT_P010LE:

> +        call_crop_kernel(ctx, s->cu_func_ushort, 1,

> +                           in->data[0], in->width, in->height, in->linesize[0]/2,

> +                           out->data[0], out->width, out->height, out->linesize[0]/2,

> +                           s->planes_in[0].left, s->planes_in[0].top, 2);

> +        call_crop_kernel(ctx, s->cu_func_ushort2, 2,

> +                           in->data[1], in->width / 2, in->height / 2, in->linesize[1]/2,

> +                           out->data[0] + out->linesize[0] * ((out->height + 31) & ~0x1f),

> out->width / 2, out->height / 2, out->linesize[1] / 4,

> +                           s->planes_in[1].left, s->planes_in[1].top, 2);

> +        break;

> +    case AV_PIX_FMT_P016LE:

> +        call_crop_kernel(ctx, s->cu_func_ushort, 1,

> +                           in->data[0], in->width, in->height, in->linesize[0] / 2,

> +                           out->data[0], out->width, out->height, out->linesize[0] / 2,

> +                           s->planes_in[0].left, s->planes_in[0].top, 2);

> +        call_crop_kernel(ctx, s->cu_func_ushort2, 2,

> +                           in->data[1], in->width / 2, in->height / 2, in->linesize[1] / 2,

> +                           out->data[0] + out->linesize[0] * ((out->height + 31) & ~0x1f),

> out->width / 2, out->height / 2, out->linesize[1] / 4,

> +                           s->planes_in[1].left, s->planes_in[1].top, 2);

> +        break;

> +    default:

> +        return AVERROR_BUG;

> +    }

> +

> +    return 0;

> +}

> +

> +static int cudacrop_crop(AVFilterContext *ctx, AVFrame *out, AVFrame *in)

> +{

> +    CUDACropContext *s = ctx->priv;

> +    AVFrame *src = in;

> +    int ret;

> +

> +    ret = cropcuda_crop_internal(ctx, s->frame, src);

> +    if (ret < 0)

> +        return ret;

> +

> +    src = s->frame;

> +    ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0);

> +    if (ret < 0)

> +        return ret;

> +

> +    av_frame_move_ref(out, s->frame);

> +    av_frame_move_ref(s->frame, s->tmp_frame);

> +

> +    ret = av_frame_copy_props(out, in);

> +    if (ret < 0)

> +        return ret;

> +

> +    return 0;

> +}

> +

> +static int cudacrop_filter_frame(AVFilterLink *link, AVFrame *in)

> +{

> +    AVFilterContext              *ctx = link->dst;

> +    CUDACropContext              *s = ctx->priv;

> +    AVFilterLink             *outlink = ctx->outputs[0];

> +    CudaFunctions *cu = s->hwctx->internal->cuda_dl;

> +

> +    AVFrame *out = NULL;

> +    CUcontext dummy;

> +    int ret = 0;

> +

> +    out = av_frame_alloc();

> +    if (!out) {

> +        ret = AVERROR(ENOMEM);

> +        goto fail;

> +    }

> +

> +    ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));

> +    if (ret < 0)

> +        goto fail;

> +

> +    ret = cudacrop_crop(ctx, out, in);

> +

> +    CHECK_CU(cu->cuCtxPopCurrent(&dummy));

> +    if (ret < 0)

> +        goto fail;

> +

> +    av_reduce(&out->sample_aspect_ratio.num, &out-

> >sample_aspect_ratio.den,

> +              (int64_t)in->sample_aspect_ratio.num * outlink->h * link->w,

> +              (int64_t)in->sample_aspect_ratio.den * outlink->w * link->h,

> +              INT_MAX);

> +

> +    av_frame_free(&in);

> +    return ff_filter_frame(outlink, out);

> +fail:

> +    av_frame_free(&in);

> +    av_frame_free(&out);

> +    return ret;

> +}

> +

> +#define OFFSET(x) offsetof(CUDACropContext, x)

> +#define FLAGS

> (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM)

> +static const AVOption options[] = {

> +    { "w",      "set the width crop area expression",  OFFSET(w_expr),

> AV_OPT_TYPE_STRING, { .str = "iw"   }, .flags = FLAGS },

> +    { "h",      "set the height crop area expression", OFFSET(h_expr),

> AV_OPT_TYPE_STRING, { .str = "ih"   }, .flags = FLAGS },

> +    { "x",      "set the x crop area expression",      OFFSET(x_expr),

> AV_OPT_TYPE_STRING, { .str = "(in_w-out_w)/2"}, .flags = FLAGS },

> +    { "y",      "set the y crop area expression",      OFFSET(y_expr),

> AV_OPT_TYPE_STRING, { .str = "(in_h-out_h)/2"}, .flags = FLAGS },

> +    { NULL },

> +};

> +

> +static const AVClass cudacrop_class = {

> +    .class_name = "cudacrop",

> +    .item_name  = av_default_item_name,

> +    .option     = options,

> +    .version    = LIBAVUTIL_VERSION_INT,

> +};

> +

> +static const AVFilterPad cudacrop_inputs[] = {

> +    {

> +        .name        = "default",

> +        .type        = AVMEDIA_TYPE_VIDEO,

> +        .filter_frame = cudacrop_filter_frame,

> +        .config_props = cudacrop_config_input,

> +    },

> +    { NULL }

> +};

> +

> +static const AVFilterPad cudacrop_outputs[] = {

> +    {

> +        .name         = "default",

> +        .type         = AVMEDIA_TYPE_VIDEO,

> +        .config_props = cudacrop_config_output,

> +    },

> +    { NULL }

> +};

> +

> +AVFilter ff_vf_crop_cuda = {

> +    .name      = "crop_cuda",

> +    .description = NULL_IF_CONFIG_SMALL("GPU accelerated video crop"),

> +

> +    .init          = cudacrop_init,

> +    .uninit        = cudacrop_uninit,

> +    .query_formats = cudacrop_query_formats,

> +

> +    .priv_size = sizeof(CUDACropContext),

> +    .priv_class = &cudacrop_class,

> +

> +    .inputs    = cudacrop_inputs,

> +    .outputs   = cudacrop_outputs,

> +

> +    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,

> +};

> diff --git a/libavfilter/vf_crop_cuda.cu b/libavfilter/vf_crop_cuda.cu

> new file mode 100644

> index 0000000..4b94b73

> --- /dev/null

> +++ b/libavfilter/vf_crop_cuda.cu

> @@ -0,0 +1,109 @@

> +/*

> + * Copyright (c) 2019, iQIYI CORPORATION. All rights reserved.

> + *

> + * Permission is hereby granted, free of charge, to any person obtaining a

> + * copy of this software and associated documentation files (the "Software"),

> + * to deal in the Software without restriction, including without limitation

> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,

> + * and/or sell copies of the Software, and to permit persons to whom the

> + * Software is furnished to do so, subject to the following conditions:

> + *

> + * The above copyright notice and this permission notice shall be included in

> + * all copies or substantial portions of the Software.

> + *

> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,

> EXPRESS OR

> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF

> MERCHANTABILITY,

> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO

> EVENT SHALL

> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,

> DAMAGES OR OTHER

> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,

> ARISING

> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR

> OTHER

> + * DEALINGS IN THE SOFTWARE.

> + */

> +

> +extern "C" {

> +

> +__global__ void Crop_uchar(cudaTextureObject_t uchar_tex,

> +                           unsigned char *dst,

> +                           int dst_width, int dst_height, int dst_pitch,

> +                           int left, int top)

> +{

> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;

> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;

> +    int xi = xo + left;

> +    int yi = yo + top;

> +

> +    if (yo < dst_height && xo < dst_width)

> +        dst[yo*dst_pitch+xo] = (unsigned char) tex2D<unsigned char>(uchar_tex,

> xi, yi);

> +}

> +

> +__global__ void Crop_uchar2(cudaTextureObject_t uchar2_tex,

> +                            uchar2 *dst,

> +                            int dst_width, int dst_height, int dst_pitch,

> +                            int left, int top)

> +{

> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;

> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;

> +    int xi = xo + left;

> +    int yi = yo + top;

> +

> +    if (yo < dst_height && xo < dst_width)

> +        dst[yo*dst_pitch+xo] = (uchar2) tex2D<uchar2>(uchar2_tex, xi, yi);

> +}

> +

> +__global__ void Crop_uchar4(cudaTextureObject_t uchar4_tex,

> +                            uchar4 *dst,

> +                            int dst_width, int dst_height, int dst_pitch,

> +                            int left, int top)

> +{

> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;

> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;

> +    int xi = xo + left;

> +    int yi = yo + top;

> +

> +    if (yo < dst_height && xo < dst_width)

> +        dst[yo*dst_pitch+xo] = (uchar4) tex2D<uchar4>(uchar4_tex, xi, yi);

> +}

> +

> +__global__ void Crop_ushort(cudaTextureObject_t ushort_tex,

> +                            unsigned short *dst,

> +                            int dst_width, int dst_height, int dst_pitch,

> +                            int left, int top)

> +{

> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;

> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;

> +    int xi = xo + left;

> +    int yi = yo + top;

> +

> +    if (yo < dst_height && xo < dst_width)

> +        dst[yo*dst_pitch+xo] = (unsigned short) tex2D<unsigned short>(ushort_tex,

> xi, yi);

> +}

> +

> +__global__ void Crop_ushort2(cudaTextureObject_t ushort2_tex,

> +                             ushort2 *dst,

> +                             int dst_width, int dst_height, int dst_pitch,

> +                             int left, int top)

> +{

> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;

> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;

> +    int xi = xo + left;

> +    int yi = yo + top;

> +

> +    if (yo < dst_height && xo < dst_width)

> +        dst[yo*dst_pitch+xo] = (ushort2) tex2D<ushort2>(ushort2_tex, xi, yi);

> +}

> +

> +__global__ void Crop_ushort4(cudaTextureObject_t ushort4_tex,

> +                             ushort4 *dst,

> +                             int dst_width, int dst_height, int dst_pitch,

> +                             int left, int top)

> +{

> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;

> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;

> +    int xi = xo + left;

> +    int yi = yo + top;

> +

> +    if (yo < dst_height && xo < dst_width)

> +        dst[yo*dst_pitch+xo] = (ushort4) tex2D<ushort4>(ushort4_tex, xi, yi);

> +}

> +

> +}

> --

> 1.8.3.1

> 

> _______________________________________________

> ffmpeg-devel mailing list

> ffmpeg-devel@ffmpeg.org

> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

> 

> To unsubscribe, visit link above, or email

> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Diego Felix de Souza via ffmpeg-devel March 25, 2019, 4:57 a.m. UTC | #5
On Sat, 23 Mar 2019 23:51:10 +0800
UsingtcNower <nowerzt@gmail.com> wrote:

> Signed-off-by: UsingtcNower <nowerzt@gmail.com>
> ---
>  Changelog                   |   1 +
>  configure                   |   1 +
>  doc/filters.texi            |  31 +++
>  libavfilter/Makefile        |   1 +
>  libavfilter/allfilters.c    |   1 +
>  libavfilter/version.h       |   2 +-
>  libavfilter/vf_crop_cuda.c  | 638
> ++++++++++++++++++++++++++++++++++++++++++++
> libavfilter/vf_crop_cuda.cu | 109 ++++++++ 8 files changed, 783
> insertions(+), 1 deletion(-) create mode 100644
> libavfilter/vf_crop_cuda.c create mode 100644
> libavfilter/vf_crop_cuda.cu
> 
> diff --git a/Changelog b/Changelog
> index ad7e82f..f224fc8 100644
> --- a/Changelog
> +++ b/Changelog
> @@ -20,6 +20,7 @@ version <next>:
>  - libaribb24 based ARIB STD-B24 caption support (profiles A and C)
>  - Support decoding of HEVC 4:4:4 content in nvdec and cuviddec
>  - removed libndi-newtek
> +- crop_cuda GPU accelerated video crop filter

Hi,

Timo and Mark and I have been discussing this, and we think the right
thing to do is add support to vf_scale_cuda to respect the crop
properties on an input AVFrame. Mark posted a patch to vf_crop to
ensure that the properties are set, and then the scale filter should
respect those properties if they are set. You can look at
vf_scale_vaapi for how the properties are read, but they will require
explicit handling to adjust the src dimensions passed to the scale
filter.

This will be a more efficient way of handling crops, in terms of total
lines of code and also allowing crop/scale with one less copy.

I know this is quite different from the approach you've taken here, and
we appreciate the work you've done, but it should be better overall to
implement this integrated method.

Thanks,

>  
>  version 4.1:
> diff --git a/configure b/configure
> index 331393f..3f3ac2f 100755
> --- a/configure
> +++ b/configure
> @@ -2973,6 +2973,7 @@ qsvvpp_select="qsv"
>  vaapi_encode_deps="vaapi"
>  v4l2_m2m_deps="linux_videodev2_h sem_timedwait"
>  
> +crop_cuda_filter_deps="ffnvcodec cuda_nvcc"
>  hwupload_cuda_filter_deps="ffnvcodec"
>  scale_npp_filter_deps="ffnvcodec libnpp"
>  scale_cuda_filter_deps="ffnvcodec cuda_nvcc"
> diff --git a/doc/filters.texi b/doc/filters.texi
> index 4ffb392..ee16a2d 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -7415,6 +7415,37 @@ If the specified expression is not valid, it
> is kept at its current value.
>  @end table
>  
> +@section crop_cuda
> +
> +Crop the input video to given dimensions, implemented in CUDA.
> +
> +It accepts the following parameters:
> +
> +@table @option
> +
> +@item w
> +The width of the output video. It defaults to @code{iw}.
> +This expression is evaluated only once during the filter
> +configuration.
> +
> +@item h
> +The height of the output video. It defaults to @code{ih}.
> +This expression is evaluated only once during the filter
> +configuration.
> +
> +@item x
> +The horizontal position, in the input video, of the left edge of the
> output +video. It defaults to @code{(in_w-out_w)/2}.
> +This expression is evaluated only once during the filter
> +configuration.
> +
> +@item y
> +The vertical position, in the input video, of the top edge of the
> output video. +It defaults to @code{(in_h-out_h)/2}.
> +This expression is evaluated only once during the filter
> +configuration.
> +@end table
> +
>  @section cropdetect
>  
>  Auto-detect the crop size.
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index fef6ec5..84df037 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -187,6 +187,7 @@ OBJS-$(CONFIG_COPY_FILTER)                   +=
> vf_copy.o OBJS-$(CONFIG_COREIMAGE_FILTER)              +=
> vf_coreimage.o OBJS-$(CONFIG_COVER_RECT_FILTER)             +=
> vf_cover_rect.o lavfutils.o
> OBJS-$(CONFIG_CROP_FILTER)                   += vf_crop.o
> +OBJS-$(CONFIG_CROP_CUDA_FILTER)              += vf_crop_cuda.o
> vf_crop_cuda.ptx.o OBJS-$(CONFIG_CROPDETECT_FILTER)             +=
> vf_cropdetect.o OBJS-$(CONFIG_CUE_FILTER)                    +=
> f_cue.o OBJS-$(CONFIG_CURVES_FILTER)                 += vf_curves.o
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index c51ae0f..550e545 100644 --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -175,6 +175,7 @@ extern AVFilter ff_vf_copy;
>  extern AVFilter ff_vf_coreimage;
>  extern AVFilter ff_vf_cover_rect;
>  extern AVFilter ff_vf_crop;
> +extern AVFilter ff_vf_crop_cuda;
>  extern AVFilter ff_vf_cropdetect;
>  extern AVFilter ff_vf_cue;
>  extern AVFilter ff_vf_curves;
> diff --git a/libavfilter/version.h b/libavfilter/version.h
> index c71282c..5aa95f4 100644
> --- a/libavfilter/version.h
> +++ b/libavfilter/version.h
> @@ -31,7 +31,7 @@
>  
>  #define LIBAVFILTER_VERSION_MAJOR   7
>  #define LIBAVFILTER_VERSION_MINOR  48
> -#define LIBAVFILTER_VERSION_MICRO 100
> +#define LIBAVFILTER_VERSION_MICRO 101
>  
>  #define LIBAVFILTER_VERSION_INT
> AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
> LIBAVFILTER_VERSION_MINOR, \ diff --git a/libavfilter/vf_crop_cuda.c
> b/libavfilter/vf_crop_cuda.c new file mode 100644
> index 0000000..fc6a2a6
> --- /dev/null
> +++ b/libavfilter/vf_crop_cuda.c
> @@ -0,0 +1,638 @@
> +/*
> +* Copyright (c) 2019, iQIYI CORPORATION. All rights reserved.
> +*
> +* Permission is hereby granted, free of charge, to any person
> obtaining a +* copy of this software and associated documentation
> files (the "Software"), +* to deal in the Software without
> restriction, including without limitation +* the rights to use, copy,
> modify, merge, publish, distribute, sublicense, +* and/or sell copies
> of the Software, and to permit persons to whom the +* Software is
> furnished to do so, subject to the following conditions: +*
> +* The above copyright notice and this permission notice shall be
> included in +* all copies or substantial portions of the Software.
> +*
> +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND
> NONINFRINGEMENT.  IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT
> HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY,
> WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM,
> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +*
> DEALINGS IN THE SOFTWARE. +*/
> +
> +#include <stdio.h>
> +#include <string.h>
> +
> +#include "libavutil/avstring.h"
> +#include "libavutil/common.h"
> +#include "libavutil/hwcontext.h"
> +#include "libavutil/hwcontext_cuda_internal.h"
> +#include "libavutil/cuda_check.h"
> +#include "libavutil/internal.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/pixdesc.h"
> +#include "libavutil/eval.h"
> +
> +#include "avfilter.h"
> +#include "formats.h"
> +#include "internal.h"
> +#include "video.h"
> +
> +static const char *const var_names[] = {
> +    "in_w", "iw",   ///< width  of the input video
> +    "in_h", "ih",   ///< height of the input video
> +    "out_w", "ow",  ///< width  of the cropped video
> +    "out_h", "oh",  ///< height of the cropped video
> +    "x",
> +    "y",
> +    NULL
> +};
> +
> +enum var_name {
> +    VAR_IN_W,  VAR_IW,
> +    VAR_IN_H,  VAR_IH,
> +    VAR_OUT_W, VAR_OW,
> +    VAR_OUT_H, VAR_OH,
> +    VAR_X,
> +    VAR_Y,
> +    VAR_VARS_NB
> +};
> +
> +static const enum AVPixelFormat supported_formats[] = {
> +    AV_PIX_FMT_YUV420P,
> +    AV_PIX_FMT_NV12,
> +    AV_PIX_FMT_YUV444P,
> +    AV_PIX_FMT_P010,
> +    AV_PIX_FMT_P016
> +};
> +
> +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
> +#define ALIGN_UP(a, b) (((a) + (b) - 1) & ~((b) - 1))
> +#define NUM_BUFFERS 2
> +#define BLOCKX 32
> +#define BLOCKY 16
> +
> +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx,
> s->hwctx->internal->cuda_dl, x) +
> +typedef struct CUDACropContext {
> +    const AVClass *class;
> +    AVCUDADeviceContext *hwctx;
> +    enum AVPixelFormat in_fmt;
> +    enum AVPixelFormat out_fmt;
> +
> +    struct {
> +        int width;
> +        int height;
> +        int left;
> +        int top;
> +    } planes_in[3], planes_out[3];
> +
> +    AVBufferRef *frames_ctx;
> +    AVFrame     *frame;
> +
> +    AVFrame *tmp_frame;
> +    int passthrough;
> +
> +    /**
> +     * Output sw format. AV_PIX_FMT_NONE for no conversion.
> +     */
> +    enum AVPixelFormat format;
> +
> +    int w,h,x,y;
> +    char *w_expr, *h_expr, *x_expr, *y_expr;
> +    double var_values[VAR_VARS_NB];
> +
> +    CUcontext   cu_ctx;
> +    CUmodule    cu_module;
> +    CUfunction  cu_func_uchar;
> +    CUfunction  cu_func_uchar2;
> +    CUfunction  cu_func_uchar4;
> +    CUfunction  cu_func_ushort;
> +    CUfunction  cu_func_ushort2;
> +    CUfunction  cu_func_ushort4;
> +    CUstream    cu_stream;
> +
> +    CUdeviceptr srcBuffer;
> +    CUdeviceptr dstBuffer;
> +    int         tex_alignment;
> +} CUDACropContext;
> +
> +static av_cold int cudacrop_init(AVFilterContext *ctx)
> +{
> +    CUDACropContext *s = ctx->priv;
> +
> +    s->format = AV_PIX_FMT_NONE;
> +    s->frame = av_frame_alloc();
> +    if (!s->frame)
> +        return AVERROR(ENOMEM);
> +
> +    s->tmp_frame = av_frame_alloc();
> +    if (!s->tmp_frame)
> +        return AVERROR(ENOMEM);
> +
> +    return 0;
> +}
> +
> +static av_cold void cudacrop_uninit(AVFilterContext *ctx)
> +{
> +    CUDACropContext *s = ctx->priv;
> +
> +    av_frame_free(&s->frame);
> +    av_buffer_unref(&s->frames_ctx);
> +    av_frame_free(&s->tmp_frame);
> +}
> +
> +static int cudacrop_query_formats(AVFilterContext *ctx)
> +{
> +    static const enum AVPixelFormat pixel_formats[] = {
> +        AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE,
> +    };
> +    AVFilterFormats *pix_fmts = ff_make_format_list(pixel_formats);
> +
> +    return ff_set_common_formats(ctx, pix_fmts);
> +}
> +
> +static av_cold int init_stage(CUDACropContext *s, AVBufferRef
> *device_ctx) +{
> +    AVBufferRef *out_ref = NULL;
> +    AVHWFramesContext *out_ctx;
> +    int in_sw, in_sh, out_sw, out_sh;
> +    int ret, i;
> +
> +    av_pix_fmt_get_chroma_sub_sample(s->in_fmt,  &in_sw,  &in_sh);
> +    av_pix_fmt_get_chroma_sub_sample(s->out_fmt, &out_sw, &out_sh);
> +    if (!s->planes_out[0].width) {
> +        s->planes_out[0].width  = s->planes_in[0].width;
> +        s->planes_out[0].height = s->planes_in[0].height;
> +        s->planes_out[0].left = s->planes_in[0].left;
> +        s->planes_out[0].top  = s->planes_in[0].top;
> +    }
> +
> +    for (i = 1; i < FF_ARRAY_ELEMS(s->planes_in); i++) {
> +        s->planes_in[i].width   = s->planes_in[0].width   >> in_sw;
> +        s->planes_in[i].height  = s->planes_in[0].height  >> in_sh;
> +        s->planes_in[i].left    = s->planes_in[0].left    >> in_sw;
> +        s->planes_in[i].top     = s->planes_in[0].top     >> in_sh;
> +        s->planes_out[i].width  = s->planes_out[0].width  >> out_sw;
> +        s->planes_out[i].height = s->planes_out[0].height >> out_sh;
> +        s->planes_out[i].left   = 0;
> +        s->planes_out[i].top    = 0;
> +
> +    }
> +
> +    out_ref = av_hwframe_ctx_alloc(device_ctx);
> +    if (!out_ref)
> +        return AVERROR(ENOMEM);
> +    out_ctx = (AVHWFramesContext*)out_ref->data;
> +
> +    out_ctx->format    = AV_PIX_FMT_CUDA;
> +    out_ctx->sw_format = s->out_fmt;
> +    out_ctx->width     = FFALIGN(s->planes_out[0].width,  32);
> +    out_ctx->height    = FFALIGN(s->planes_out[0].height, 32);
> +
> +    ret = av_hwframe_ctx_init(out_ref);
> +    if (ret < 0)
> +        goto fail;
> +
> +    av_frame_unref(s->frame);
> +    ret = av_hwframe_get_buffer(out_ref, s->frame, 0);
> +    if (ret < 0)
> +        goto fail;
> +
> +    s->frame->width  = s->planes_out[0].width;
> +    s->frame->height = s->planes_out[0].height;
> +
> +    av_buffer_unref(&s->frames_ctx);
> +    s->frames_ctx = out_ref;
> +
> +    return 0;
> +fail:
> +    av_buffer_unref(&out_ref);
> +    return ret;
> +}
> +
> +static int format_is_supported(enum AVPixelFormat fmt)
> +{
> +    int i;
> +
> +    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
> +        if (supported_formats[i] == fmt)
> +            return 1;
> +    return 0;
> +}
> +
> +static av_cold int init_processing_chain(AVFilterContext *ctx, int
> in_width, int in_height,
> +                                         int out_width, int
> out_height,
> +                                         int left, int top)
> +{
> +    CUDACropContext *s = ctx->priv;
> +
> +    AVHWFramesContext *in_frames_ctx;
> +
> +    enum AVPixelFormat in_format;
> +    enum AVPixelFormat out_format;
> +    int ret;
> +
> +    /* check that we have a hw context */
> +    if (!ctx->inputs[0]->hw_frames_ctx) {
> +        av_log(ctx, AV_LOG_ERROR, "No hw context provided on
> input\n");
> +        return AVERROR(EINVAL);
> +    }
> +    in_frames_ctx =
> (AVHWFramesContext*)ctx->inputs[0]->hw_frames_ctx->data;
> +    in_format     = in_frames_ctx->sw_format;
> +    out_format    = (s->format == AV_PIX_FMT_NONE) ? in_format :
> s->format; +
> +    if (!format_is_supported(in_format)) {
> +        av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n",
> +               av_get_pix_fmt_name(in_format));
> +        return AVERROR(ENOSYS);
> +    }
> +    if (!format_is_supported(out_format)) {
> +        av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n",
> +               av_get_pix_fmt_name(out_format));
> +        return AVERROR(ENOSYS);
> +    }
> +
> +    if (in_width == out_width && in_height == out_height)
> +        s->passthrough = 1;
> +
> +    s->in_fmt = in_format;
> +    s->out_fmt = out_format;
> +
> +    s->planes_in[0].width   = in_width;
> +    s->planes_in[0].height  = in_height;
> +    s->planes_in[0].left = left;
> +    s->planes_in[0].top = top;
> +    s->planes_out[0].width  = out_width;
> +    s->planes_out[0].height = out_height;
> +    s->planes_out[0].left = 0;
> +    s->planes_out[0].top = 0;
> +
> +    ret = init_stage(s, in_frames_ctx->device_ref);
> +    if (ret < 0)
> +        return ret;
> +
> +    ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->frames_ctx);
> +    if (!ctx->outputs[0]->hw_frames_ctx)
> +        return AVERROR(ENOMEM);
> +
> +    return 0;
> +}
> +
> +static inline int normalize_double(int *n, double d)
> +{
> +    int ret = 0;
> +
> +    if (isnan(d))
> +        ret = AVERROR(EINVAL);
> +    else if (d > INT_MAX || d < INT_MIN) {
> +        *n = d > INT_MAX ? INT_MAX : INT_MIN;
> +        ret = AVERROR(EINVAL);
> +    } else
> +        *n = lrint(d);
> +
> +    return ret;
> +}
> +
> +static av_cold int cudacrop_config_input(AVFilterLink *inlink)
> +{
> +    AVFilterContext *ctx = inlink->dst;
> +    CUDACropContext *s = ctx->priv;
> +    double res;
> +    int ret;
> +
> +    s->var_values[VAR_IN_W] = s->var_values[VAR_IW] = inlink->w;
> +    s->var_values[VAR_IN_H] = s->var_values[VAR_IH] = inlink->h;
> +    s->var_values[VAR_OUT_W] = s->var_values[VAR_OW] = NAN;
> +    s->var_values[VAR_OUT_H] = s->var_values[VAR_OH] = NAN;
> +    s->var_values[VAR_X] = NAN;
> +    s->var_values[VAR_Y] = NAN;
> +    if ((ret = av_expr_parse_and_eval(&res, s->w_expr,
> +                                      var_names, s->var_values,
> +                                      NULL, NULL, NULL, NULL, NULL,
> 0, ctx)) < 0)
> +        goto fail;
> +    s->var_values[VAR_OUT_W] = s->var_values[VAR_OW] = res;
> +    if ((ret = av_expr_parse_and_eval(&res, s->h_expr,
> +                                      var_names, s->var_values,
> +                                      NULL, NULL, NULL, NULL, NULL,
> 0, ctx)) < 0)
> +        goto fail;
> +    s->var_values[VAR_OUT_H] = s->var_values[VAR_OH] = res;
> +    if ((ret = av_expr_parse_and_eval(&res, s->x_expr,
> +                                      var_names, s->var_values,
> +                                      NULL, NULL, NULL, NULL, NULL,
> 0, ctx)) < 0)
> +        goto fail;
> +    s->var_values[VAR_X] = res;
> +    if ((ret = av_expr_parse_and_eval(&res, s->y_expr,
> +                                      var_names, s->var_values,
> +                                      NULL, NULL, NULL, NULL, NULL,
> 0, ctx)) < 0)
> +        goto fail;
> +    s->var_values[VAR_Y] = res;
> +    if (normalize_double(&s->w, s->var_values[VAR_OW]) < 0 ||
> +        normalize_double(&s->h, s->var_values[VAR_OH]) < 0 ||
> +        normalize_double(&s->x, s->var_values[VAR_X]) < 0 ||
> +        normalize_double(&s->y, s->var_values[VAR_Y]) < 0) {
> +        av_log(ctx, AV_LOG_ERROR,
> +               "Too big value or invalid expression for out_w/ow or
> out_h/oh or x or y");
> +        return AVERROR(EINVAL);
> +    }
> +
> +fail:
> +    return ret;
> +}
> +
> +static av_cold int cudacrop_config_output(AVFilterLink *outlink)
> +{
> +    AVFilterContext *ctx = outlink->src;
> +    AVFilterLink *inlink = outlink->src->inputs[0];
> +    CUDACropContext *s  = ctx->priv;
> +    AVHWFramesContext     *frames_ctx =
> (AVHWFramesContext*)inlink->hw_frames_ctx->data;
> +    AVCUDADeviceContext *device_hwctx =
> frames_ctx->device_ctx->hwctx;
> +    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
> +    CudaFunctions *cu = device_hwctx->internal->cuda_dl;
> +    int ret;
> +
> +    extern char vf_crop_cuda_ptx[];
> +
> +    s->hwctx = device_hwctx;
> +    s->cu_stream = s->hwctx->stream;
> +
> +    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
> +    if (ret < 0)
> +        goto fail;
> +
> +    ret = CHECK_CU(cu->cuModuleLoadData(&s->cu_module,
> vf_crop_cuda_ptx));
> +    if (ret < 0)
> +        goto fail;
> +
> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar,
> s->cu_module, "Crop_uchar"));
> +    if (ret < 0)
> +        goto fail;
> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar2,
> s->cu_module, "Crop_uchar2"));
> +    if (ret < 0)
> +        goto fail;
> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar4,
> s->cu_module, "Crop_uchar4"));
> +    if (ret < 0)
> +        goto fail;
> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort,
> s->cu_module, "Crop_ushort"));
> +    if (ret < 0)
> +        goto fail;
> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort2,
> s->cu_module, "Crop_ushort2"));
> +    if (ret < 0)
> +        goto fail;
> +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort4,
> s->cu_module, "Crop_ushort4"));
> +    if (ret < 0)
> +        goto fail;
> +
> +    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
> +
> +    outlink->w = s->w;
> +    outlink->h = s->h;
> +
> +    ret = init_processing_chain(ctx, inlink->w, inlink->h, s->w,
> s->h, s->x, s->y);
> +    if (ret < 0)
> +        return ret;
> +
> +    if (inlink->sample_aspect_ratio.num) {
> +        outlink->sample_aspect_ratio =
> av_mul_q((AVRational){outlink->h*inlink->w,
> +
> outlink->w*inlink->h},
> +
> inlink->sample_aspect_ratio);
> +    } else {
> +        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
> +    }
> +
> +    return 0;
> +
> +fail:
> +    return ret;
> +}
> +
> +static int call_crop_kernel(AVFilterContext *ctx, CUfunction func,
> int channels,
> +                              uint8_t *src_dptr, int src_width, int
> src_height, int src_pitch,
> +                              uint8_t *dst_dptr, int dst_width, int
> dst_height, int dst_pitch,
> +                              int left, int top, int pixel_size)
> +{
> +    CUDACropContext *s = ctx->priv;
> +    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
> +    CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr;
> +    CUtexObject tex = 0;
> +    void *args_uchar[] = { &tex, &dst_devptr, &dst_width,
> &dst_height, &dst_pitch, &left, &top };
> +    int ret;
> +
> +    CUDA_TEXTURE_DESC tex_desc = {
> +        .filterMode = CU_TR_FILTER_MODE_LINEAR,
> +        .flags = CU_TRSF_READ_AS_INTEGER,
> +    };
> +
> +    CUDA_RESOURCE_DESC res_desc = {
> +        .resType = CU_RESOURCE_TYPE_PITCH2D,
> +        .res.pitch2D.format = pixel_size == 1 ?
> +                              CU_AD_FORMAT_UNSIGNED_INT8 :
> +                              CU_AD_FORMAT_UNSIGNED_INT16,
> +        .res.pitch2D.numChannels = channels,
> +        .res.pitch2D.width = src_width,
> +        .res.pitch2D.height = src_height,
> +        .res.pitch2D.pitchInBytes = src_pitch,
> +        .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
> +    };
> +
> +    ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc,
> NULL));
> +    if (ret < 0)
> +        goto exit;
> +    
> +    ret = CHECK_CU(cu->cuLaunchKernel(func, DIV_UP(dst_width,
> BLOCKX), DIV_UP(dst_height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0,
> s->cu_stream, args_uchar, NULL)); + +exit:
> +    if (tex)
> +        CHECK_CU(cu->cuTexObjectDestroy(tex));
> +    return ret;
> +}
> +
> +static int cropcuda_crop_internal(AVFilterContext *ctx,
> +                            AVFrame *out, AVFrame *in)
> +{
> +    AVHWFramesContext *in_frames_ctx =
> (AVHWFramesContext*)in->hw_frames_ctx->data;
> +    CUDACropContext *s = ctx->priv;
> +
> +    switch (in_frames_ctx->sw_format) {
> +    case AV_PIX_FMT_YUV420P:
> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> +                           in->data[0], in->width, in->height,
> in->linesize[0],
> +                           out->data[0], out->width, out->height,
> out->linesize[0],
> +                           s->planes_in[0].left,
> s->planes_in[0].top, 1);
> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> +                           in->data[0]+in->linesize[0]*in->height,
> in->width/2, in->height/2, in->linesize[0]/2,
> +
> out->data[0]+out->linesize[0]*out->height, out->width/2,
> out->height/2, out->linesize[0]/2,
> +                           s->planes_in[1].left,
> s->planes_in[1].top, 1);
> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> +                           in->data[0]+
> ALIGN_UP((in->linesize[0]*in->height*5)/4, s->tex_alignment),
> in->width/2, in->height/2, in->linesize[0]/2,
> +
> out->data[0]+(out->linesize[0]*out->height*5)/4, out->width/2,
> out->height/2, out->linesize[0]/2,
> +                           s->planes_in[2].left,
> s->planes_in[2].top, 1);
> +        break;
> +    case AV_PIX_FMT_YUV444P:
> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> +                           in->data[0], in->width, in->height,
> in->linesize[0],
> +                           out->data[0], out->width, out->height,
> out->linesize[0],
> +                           s->planes_in[0].left,
> s->planes_in[0].top, 1);
> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> +                           in->data[0]+in->linesize[0]*in->height,
> in->width, in->height, in->linesize[0],
> +
> out->data[0]+out->linesize[0]*out->height, out->width, out->height,
> out->linesize[0],
> +                           s->planes_in[1].left,
> s->planes_in[1].top, 1);
> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> +                           in->data[0]+in->linesize[0]*in->height*2,
> in->width, in->height, in->linesize[0],
> +
> out->data[0]+out->linesize[0]*out->height*2, out->width, out->height,
> out->linesize[0],
> +                           s->planes_in[2].left,
> s->planes_in[2].top, 1);
> +        break;
> +    case AV_PIX_FMT_NV12:
> +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> +                           in->data[0], in->width, in->height,
> in->linesize[0],
> +                           out->data[0], out->width, out->height,
> out->linesize[0],
> +                           s->planes_in[0].left,
> s->planes_in[0].top, 1);
> +        call_crop_kernel(ctx, s->cu_func_uchar2, 2,
> +                           in->data[1], in->width/2, in->height/2,
> in->linesize[1],
> +                           out->data[0] + out->linesize[0] *
> ((out->height + 31) & ~0x1f), out->width/2, out->height/2,
> out->linesize[1]/2,
> +                           s->planes_in[1].left,
> s->planes_in[1].top, 1);
> +        break;
> +    case AV_PIX_FMT_P010LE:
> +        call_crop_kernel(ctx, s->cu_func_ushort, 1,
> +                           in->data[0], in->width, in->height,
> in->linesize[0]/2,
> +                           out->data[0], out->width, out->height,
> out->linesize[0]/2,
> +                           s->planes_in[0].left,
> s->planes_in[0].top, 2);
> +        call_crop_kernel(ctx, s->cu_func_ushort2, 2,
> +                           in->data[1], in->width / 2, in->height /
> 2, in->linesize[1]/2,
> +                           out->data[0] + out->linesize[0] *
> ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2,
> out->linesize[1] / 4,
> +                           s->planes_in[1].left,
> s->planes_in[1].top, 2);
> +        break;
> +    case AV_PIX_FMT_P016LE:
> +        call_crop_kernel(ctx, s->cu_func_ushort, 1,
> +                           in->data[0], in->width, in->height,
> in->linesize[0] / 2,
> +                           out->data[0], out->width, out->height,
> out->linesize[0] / 2,
> +                           s->planes_in[0].left,
> s->planes_in[0].top, 2);
> +        call_crop_kernel(ctx, s->cu_func_ushort2, 2,
> +                           in->data[1], in->width / 2, in->height /
> 2, in->linesize[1] / 2,
> +                           out->data[0] + out->linesize[0] *
> ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2,
> out->linesize[1] / 4,
> +                           s->planes_in[1].left,
> s->planes_in[1].top, 2);
> +        break;
> +    default:
> +        return AVERROR_BUG;
> +    }
> +
> +    return 0;
> +}
> +
> +static int cudacrop_crop(AVFilterContext *ctx, AVFrame *out, AVFrame
> *in) +{
> +    CUDACropContext *s = ctx->priv;
> +    AVFrame *src = in;
> +    int ret;
> +
> +    ret = cropcuda_crop_internal(ctx, s->frame, src);
> +    if (ret < 0)
> +        return ret;
> +
> +    src = s->frame;
> +    ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0);
> +    if (ret < 0)
> +        return ret;
> +
> +    av_frame_move_ref(out, s->frame);
> +    av_frame_move_ref(s->frame, s->tmp_frame);
> +
> +    ret = av_frame_copy_props(out, in);
> +    if (ret < 0)
> +        return ret;
> +
> +    return 0;
> +}
> +
> +static int cudacrop_filter_frame(AVFilterLink *link, AVFrame *in)
> +{
> +    AVFilterContext              *ctx = link->dst;
> +    CUDACropContext              *s = ctx->priv;
> +    AVFilterLink             *outlink = ctx->outputs[0];
> +    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
> +
> +    AVFrame *out = NULL;
> +    CUcontext dummy;
> +    int ret = 0;
> +
> +    out = av_frame_alloc();
> +    if (!out) {
> +        ret = AVERROR(ENOMEM);
> +        goto fail;
> +    }
> +
> +    ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
> +    if (ret < 0)
> +        goto fail;
> +
> +    ret = cudacrop_crop(ctx, out, in);
> +
> +    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
> +    if (ret < 0)
> +        goto fail;
> +
> +    av_reduce(&out->sample_aspect_ratio.num,
> &out->sample_aspect_ratio.den,
> +              (int64_t)in->sample_aspect_ratio.num * outlink->h *
> link->w,
> +              (int64_t)in->sample_aspect_ratio.den * outlink->w *
> link->h,
> +              INT_MAX);
> +
> +    av_frame_free(&in);
> +    return ff_filter_frame(outlink, out);
> +fail:
> +    av_frame_free(&in);
> +    av_frame_free(&out);
> +    return ret;
> +}
> +
> +#define OFFSET(x) offsetof(CUDACropContext, x)
> +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM)
> +static const AVOption options[] = {
> +    { "w",      "set the width crop area expression",
> OFFSET(w_expr),     AV_OPT_TYPE_STRING, { .str = "iw"   }, .flags =
> FLAGS },
> +    { "h",      "set the height crop area expression",
> OFFSET(h_expr),     AV_OPT_TYPE_STRING, { .str = "ih"   }, .flags =
> FLAGS },
> +    { "x",      "set the x crop area expression",
> OFFSET(x_expr),     AV_OPT_TYPE_STRING, { .str =
> "(in_w-out_w)/2"}, .flags = FLAGS },
> +    { "y",      "set the y crop area expression",
> OFFSET(y_expr),     AV_OPT_TYPE_STRING, { .str =
> "(in_h-out_h)/2"}, .flags = FLAGS },
> +    { NULL },
> +};
> +
> +static const AVClass cudacrop_class = {
> +    .class_name = "cudacrop",
> +    .item_name  = av_default_item_name,
> +    .option     = options,
> +    .version    = LIBAVUTIL_VERSION_INT,
> +};
> +
> +static const AVFilterPad cudacrop_inputs[] = {
> +    {
> +        .name        = "default",
> +        .type        = AVMEDIA_TYPE_VIDEO,
> +        .filter_frame = cudacrop_filter_frame,
> +        .config_props = cudacrop_config_input,
> +    },
> +    { NULL }
> +};
> +
> +static const AVFilterPad cudacrop_outputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +        .config_props = cudacrop_config_output,
> +    },
> +    { NULL }
> +};
> +
> +AVFilter ff_vf_crop_cuda = {
> +    .name      = "crop_cuda",
> +    .description = NULL_IF_CONFIG_SMALL("GPU accelerated video
> crop"), +
> +    .init          = cudacrop_init,
> +    .uninit        = cudacrop_uninit,
> +    .query_formats = cudacrop_query_formats,
> +
> +    .priv_size = sizeof(CUDACropContext),
> +    .priv_class = &cudacrop_class,
> +
> +    .inputs    = cudacrop_inputs,
> +    .outputs   = cudacrop_outputs,
> +
> +    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
> +};
> diff --git a/libavfilter/vf_crop_cuda.cu b/libavfilter/vf_crop_cuda.cu
> new file mode 100644
> index 0000000..4b94b73
> --- /dev/null
> +++ b/libavfilter/vf_crop_cuda.cu
> @@ -0,0 +1,109 @@
> +/*
> + * Copyright (c) 2019, iQIYI CORPORATION. All rights reserved.
> + *
> + * Permission is hereby granted, free of charge, to any person
> obtaining a
> + * copy of this software and associated documentation files (the
> "Software"),
> + * to deal in the Software without restriction, including without
> limitation
> + * the rights to use, copy, modify, merge, publish, distribute,
> sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom
> the
> + * Software is furnished to do so, subject to the following
> conditions:
> + *
> + * The above copyright notice and this permission notice shall be
> included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
> EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
> OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +extern "C" {
> +
> +__global__ void Crop_uchar(cudaTextureObject_t uchar_tex,
> +                           unsigned char *dst,
> +                           int dst_width, int dst_height, int
> dst_pitch,
> +                           int left, int top)
> +{
> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> +    int xi = xo + left;
> +    int yi = yo + top;
> +
> +    if (yo < dst_height && xo < dst_width)
> +        dst[yo*dst_pitch+xo] = (unsigned char) tex2D<unsigned
> char>(uchar_tex, xi, yi); +}
> +
> +__global__ void Crop_uchar2(cudaTextureObject_t uchar2_tex,
> +                            uchar2 *dst,
> +                            int dst_width, int dst_height, int
> dst_pitch,
> +                            int left, int top)
> +{
> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> +    int xi = xo + left;
> +    int yi = yo + top;
> +
> +    if (yo < dst_height && xo < dst_width)
> +        dst[yo*dst_pitch+xo] = (uchar2) tex2D<uchar2>(uchar2_tex,
> xi, yi); +}
> +
> +__global__ void Crop_uchar4(cudaTextureObject_t uchar4_tex,
> +                            uchar4 *dst,
> +                            int dst_width, int dst_height, int
> dst_pitch,
> +                            int left, int top)
> +{
> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> +    int xi = xo + left;
> +    int yi = yo + top;
> +
> +    if (yo < dst_height && xo < dst_width)
> +        dst[yo*dst_pitch+xo] = (uchar4) tex2D<uchar4>(uchar4_tex,
> xi, yi); +}
> +
> +__global__ void Crop_ushort(cudaTextureObject_t ushort_tex,
> +                            unsigned short *dst,
> +                            int dst_width, int dst_height, int
> dst_pitch,
> +                            int left, int top)
> +{
> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> +    int xi = xo + left;
> +    int yi = yo + top;
> +
> +    if (yo < dst_height && xo < dst_width)
> +        dst[yo*dst_pitch+xo] = (unsigned short) tex2D<unsigned
> short>(ushort_tex, xi, yi); +}
> +
> +__global__ void Crop_ushort2(cudaTextureObject_t ushort2_tex,
> +                             ushort2 *dst,
> +                             int dst_width, int dst_height, int
> dst_pitch,
> +                             int left, int top)
> +{
> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> +    int xi = xo + left;
> +    int yi = yo + top;
> +
> +    if (yo < dst_height && xo < dst_width)
> +        dst[yo*dst_pitch+xo] = (ushort2) tex2D<ushort2>(ushort2_tex,
> xi, yi); +}
> +
> +__global__ void Crop_ushort4(cudaTextureObject_t ushort4_tex,
> +                             ushort4 *dst,
> +                             int dst_width, int dst_height, int
> dst_pitch,
> +                             int left, int top)
> +{
> +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> +    int xi = xo + left;
> +    int yi = yo + top;
> +
> +    if (yo < dst_height && xo < dst_width)
> +        dst[yo*dst_pitch+xo] = (ushort4) tex2D<ushort4>(ushort4_tex,
> xi, yi); +}
> +
> +}




--phil
Ruiling Song March 25, 2019, 7:26 a.m. UTC | #6
> -----Original Message-----

> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of

> Philip Langdale via ffmpeg-devel

> Sent: Monday, March 25, 2019 12:57 PM

> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>

> Cc: Philip Langdale <philipl@overt.org>

> Subject: Re: [FFmpeg-devel] [PATCH][FFmpeg-devel v2] Add GPU accelerated

> video crop filter

> 

> On Sat, 23 Mar 2019 23:51:10 +0800

> UsingtcNower <nowerzt@gmail.com> wrote:

> 

> > Signed-off-by: UsingtcNower <nowerzt@gmail.com>

> > ---

> >  Changelog                   |   1 +

> >  configure                   |   1 +

> >  doc/filters.texi            |  31 +++

> >  libavfilter/Makefile        |   1 +

> >  libavfilter/allfilters.c    |   1 +

> >  libavfilter/version.h       |   2 +-

> >  libavfilter/vf_crop_cuda.c  | 638

> > ++++++++++++++++++++++++++++++++++++++++++++

> > libavfilter/vf_crop_cuda.cu | 109 ++++++++ 8 files changed, 783

> > insertions(+), 1 deletion(-) create mode 100644

> > libavfilter/vf_crop_cuda.c create mode 100644

> > libavfilter/vf_crop_cuda.cu

> >

> > diff --git a/Changelog b/Changelog

> > index ad7e82f..f224fc8 100644

> > --- a/Changelog

> > +++ b/Changelog

> > @@ -20,6 +20,7 @@ version <next>:

> >  - libaribb24 based ARIB STD-B24 caption support (profiles A and C)

> >  - Support decoding of HEVC 4:4:4 content in nvdec and cuviddec

> >  - removed libndi-newtek

> > +- crop_cuda GPU accelerated video crop filter

> 

> Hi,

> 

> Timo and Mark and I have been discussing this, and we think the right

> thing to do is add support to vf_scale_cuda to respect the crop

> properties on an input AVFrame. Mark posted a patch to vf_crop to

> ensure that the properties are set, and then the scale filter should

> respect those properties if they are set. You can look at

> vf_scale_vaapi for how the properties are read, but they will require

> explicit handling to adjust the src dimensions passed to the scale

> filter.

> 

> This will be a more efficient way of handling crops, in terms of total

> lines of code and also allowing crop/scale with one less copy.

> 

> I know this is quite different from the approach you've taken here, and

> we appreciate the work you've done, but it should be better overall to

> implement this integrated method.

Hi Philip,

Glad to hear you guys had discussion on this. As I am also considering the problem, I have some questions about your idea.
So, what if user did not insert a scale_cuda after crop filter? Do you plan to automatically insert scale_cuda or just ignore the crop?
What if user want to do crop,transpose_cuda,scale_cuda? So we also need to handle crop inside transpose_cuda filter?
(looks like we do not have transpose_cuda right now, but this filter seems needed if user want to do transpose job using cuda.)

Thanks!
Ruiling
> 

> Thanks,

> 

> >

> >  version 4.1:

> > diff --git a/configure b/configure

> > index 331393f..3f3ac2f 100755

> > --- a/configure

> > +++ b/configure

> > @@ -2973,6 +2973,7 @@ qsvvpp_select="qsv"

> >  vaapi_encode_deps="vaapi"

> >  v4l2_m2m_deps="linux_videodev2_h sem_timedwait"

> >

> > +crop_cuda_filter_deps="ffnvcodec cuda_nvcc"

> >  hwupload_cuda_filter_deps="ffnvcodec"

> >  scale_npp_filter_deps="ffnvcodec libnpp"

> >  scale_cuda_filter_deps="ffnvcodec cuda_nvcc"

> > diff --git a/doc/filters.texi b/doc/filters.texi

> > index 4ffb392..ee16a2d 100644

> > --- a/doc/filters.texi

> > +++ b/doc/filters.texi

> > @@ -7415,6 +7415,37 @@ If the specified expression is not valid, it

> > is kept at its current value.

> >  @end table

> >

> > +@section crop_cuda

> > +

> > +Crop the input video to given dimensions, implemented in CUDA.

> > +

> > +It accepts the following parameters:

> > +

> > +@table @option

> > +

> > +@item w

> > +The width of the output video. It defaults to @code{iw}.

> > +This expression is evaluated only once during the filter

> > +configuration.

> > +

> > +@item h

> > +The height of the output video. It defaults to @code{ih}.

> > +This expression is evaluated only once during the filter

> > +configuration.

> > +

> > +@item x

> > +The horizontal position, in the input video, of the left edge of the

> > output +video. It defaults to @code{(in_w-out_w)/2}.

> > +This expression is evaluated only once during the filter

> > +configuration.

> > +

> > +@item y

> > +The vertical position, in the input video, of the top edge of the

> > output video. +It defaults to @code{(in_h-out_h)/2}.

> > +This expression is evaluated only once during the filter

> > +configuration.

> > +@end table

> > +

> >  @section cropdetect

> >

> >  Auto-detect the crop size.

> > diff --git a/libavfilter/Makefile b/libavfilter/Makefile

> > index fef6ec5..84df037 100644

> > --- a/libavfilter/Makefile

> > +++ b/libavfilter/Makefile

> > @@ -187,6 +187,7 @@ OBJS-$(CONFIG_COPY_FILTER)                   +=

> > vf_copy.o OBJS-$(CONFIG_COREIMAGE_FILTER)              +=

> > vf_coreimage.o OBJS-$(CONFIG_COVER_RECT_FILTER)             +=

> > vf_cover_rect.o lavfutils.o

> > OBJS-$(CONFIG_CROP_FILTER)                   += vf_crop.o

> > +OBJS-$(CONFIG_CROP_CUDA_FILTER)              += vf_crop_cuda.o

> > vf_crop_cuda.ptx.o OBJS-$(CONFIG_CROPDETECT_FILTER)             +=

> > vf_cropdetect.o OBJS-$(CONFIG_CUE_FILTER)                    +=

> > f_cue.o OBJS-$(CONFIG_CURVES_FILTER)                 += vf_curves.o

> > diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c

> > index c51ae0f..550e545 100644 --- a/libavfilter/allfilters.c

> > +++ b/libavfilter/allfilters.c

> > @@ -175,6 +175,7 @@ extern AVFilter ff_vf_copy;

> >  extern AVFilter ff_vf_coreimage;

> >  extern AVFilter ff_vf_cover_rect;

> >  extern AVFilter ff_vf_crop;

> > +extern AVFilter ff_vf_crop_cuda;

> >  extern AVFilter ff_vf_cropdetect;

> >  extern AVFilter ff_vf_cue;

> >  extern AVFilter ff_vf_curves;

> > diff --git a/libavfilter/version.h b/libavfilter/version.h

> > index c71282c..5aa95f4 100644

> > --- a/libavfilter/version.h

> > +++ b/libavfilter/version.h

> > @@ -31,7 +31,7 @@

> >

> >  #define LIBAVFILTER_VERSION_MAJOR   7

> >  #define LIBAVFILTER_VERSION_MINOR  48

> > -#define LIBAVFILTER_VERSION_MICRO 100

> > +#define LIBAVFILTER_VERSION_MICRO 101

> >

> >  #define LIBAVFILTER_VERSION_INT

> > AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \

> > LIBAVFILTER_VERSION_MINOR, \ diff --git a/libavfilter/vf_crop_cuda.c

> > b/libavfilter/vf_crop_cuda.c new file mode 100644

> > index 0000000..fc6a2a6

> > --- /dev/null

> > +++ b/libavfilter/vf_crop_cuda.c

> > @@ -0,0 +1,638 @@

> > +/*

> > +* Copyright (c) 2019, iQIYI CORPORATION. All rights reserved.

> > +*

> > +* Permission is hereby granted, free of charge, to any person

> > obtaining a +* copy of this software and associated documentation

> > files (the "Software"), +* to deal in the Software without

> > restriction, including without limitation +* the rights to use, copy,

> > modify, merge, publish, distribute, sublicense, +* and/or sell copies

> > of the Software, and to permit persons to whom the +* Software is

> > furnished to do so, subject to the following conditions: +*

> > +* The above copyright notice and this permission notice shall be

> > included in +* all copies or substantial portions of the Software.

> > +*

> > +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,

> > EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES

> OF

> > MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND

> > NONINFRINGEMENT.  IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT

> > HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY,

> > WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*

> FROM,

> > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +*

> > DEALINGS IN THE SOFTWARE. +*/

> > +

> > +#include <stdio.h>

> > +#include <string.h>

> > +

> > +#include "libavutil/avstring.h"

> > +#include "libavutil/common.h"

> > +#include "libavutil/hwcontext.h"

> > +#include "libavutil/hwcontext_cuda_internal.h"

> > +#include "libavutil/cuda_check.h"

> > +#include "libavutil/internal.h"

> > +#include "libavutil/opt.h"

> > +#include "libavutil/pixdesc.h"

> > +#include "libavutil/eval.h"

> > +

> > +#include "avfilter.h"

> > +#include "formats.h"

> > +#include "internal.h"

> > +#include "video.h"

> > +

> > +static const char *const var_names[] = {

> > +    "in_w", "iw",   ///< width  of the input video

> > +    "in_h", "ih",   ///< height of the input video

> > +    "out_w", "ow",  ///< width  of the cropped video

> > +    "out_h", "oh",  ///< height of the cropped video

> > +    "x",

> > +    "y",

> > +    NULL

> > +};

> > +

> > +enum var_name {

> > +    VAR_IN_W,  VAR_IW,

> > +    VAR_IN_H,  VAR_IH,

> > +    VAR_OUT_W, VAR_OW,

> > +    VAR_OUT_H, VAR_OH,

> > +    VAR_X,

> > +    VAR_Y,

> > +    VAR_VARS_NB

> > +};

> > +

> > +static const enum AVPixelFormat supported_formats[] = {

> > +    AV_PIX_FMT_YUV420P,

> > +    AV_PIX_FMT_NV12,

> > +    AV_PIX_FMT_YUV444P,

> > +    AV_PIX_FMT_P010,

> > +    AV_PIX_FMT_P016

> > +};

> > +

> > +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )

> > +#define ALIGN_UP(a, b) (((a) + (b) - 1) & ~((b) - 1))

> > +#define NUM_BUFFERS 2

> > +#define BLOCKX 32

> > +#define BLOCKY 16

> > +

> > +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx,

> > s->hwctx->internal->cuda_dl, x) +

> > +typedef struct CUDACropContext {

> > +    const AVClass *class;

> > +    AVCUDADeviceContext *hwctx;

> > +    enum AVPixelFormat in_fmt;

> > +    enum AVPixelFormat out_fmt;

> > +

> > +    struct {

> > +        int width;

> > +        int height;

> > +        int left;

> > +        int top;

> > +    } planes_in[3], planes_out[3];

> > +

> > +    AVBufferRef *frames_ctx;

> > +    AVFrame     *frame;

> > +

> > +    AVFrame *tmp_frame;

> > +    int passthrough;

> > +

> > +    /**

> > +     * Output sw format. AV_PIX_FMT_NONE for no conversion.

> > +     */

> > +    enum AVPixelFormat format;

> > +

> > +    int w,h,x,y;

> > +    char *w_expr, *h_expr, *x_expr, *y_expr;

> > +    double var_values[VAR_VARS_NB];

> > +

> > +    CUcontext   cu_ctx;

> > +    CUmodule    cu_module;

> > +    CUfunction  cu_func_uchar;

> > +    CUfunction  cu_func_uchar2;

> > +    CUfunction  cu_func_uchar4;

> > +    CUfunction  cu_func_ushort;

> > +    CUfunction  cu_func_ushort2;

> > +    CUfunction  cu_func_ushort4;

> > +    CUstream    cu_stream;

> > +

> > +    CUdeviceptr srcBuffer;

> > +    CUdeviceptr dstBuffer;

> > +    int         tex_alignment;

> > +} CUDACropContext;

> > +

> > +static av_cold int cudacrop_init(AVFilterContext *ctx)

> > +{

> > +    CUDACropContext *s = ctx->priv;

> > +

> > +    s->format = AV_PIX_FMT_NONE;

> > +    s->frame = av_frame_alloc();

> > +    if (!s->frame)

> > +        return AVERROR(ENOMEM);

> > +

> > +    s->tmp_frame = av_frame_alloc();

> > +    if (!s->tmp_frame)

> > +        return AVERROR(ENOMEM);

> > +

> > +    return 0;

> > +}

> > +

> > +static av_cold void cudacrop_uninit(AVFilterContext *ctx)

> > +{

> > +    CUDACropContext *s = ctx->priv;

> > +

> > +    av_frame_free(&s->frame);

> > +    av_buffer_unref(&s->frames_ctx);

> > +    av_frame_free(&s->tmp_frame);

> > +}

> > +

> > +static int cudacrop_query_formats(AVFilterContext *ctx)

> > +{

> > +    static const enum AVPixelFormat pixel_formats[] = {

> > +        AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE,

> > +    };

> > +    AVFilterFormats *pix_fmts = ff_make_format_list(pixel_formats);

> > +

> > +    return ff_set_common_formats(ctx, pix_fmts);

> > +}

> > +

> > +static av_cold int init_stage(CUDACropContext *s, AVBufferRef

> > *device_ctx) +{

> > +    AVBufferRef *out_ref = NULL;

> > +    AVHWFramesContext *out_ctx;

> > +    int in_sw, in_sh, out_sw, out_sh;

> > +    int ret, i;

> > +

> > +    av_pix_fmt_get_chroma_sub_sample(s->in_fmt,  &in_sw,  &in_sh);

> > +    av_pix_fmt_get_chroma_sub_sample(s->out_fmt, &out_sw, &out_sh);

> > +    if (!s->planes_out[0].width) {

> > +        s->planes_out[0].width  = s->planes_in[0].width;

> > +        s->planes_out[0].height = s->planes_in[0].height;

> > +        s->planes_out[0].left = s->planes_in[0].left;

> > +        s->planes_out[0].top  = s->planes_in[0].top;

> > +    }

> > +

> > +    for (i = 1; i < FF_ARRAY_ELEMS(s->planes_in); i++) {

> > +        s->planes_in[i].width   = s->planes_in[0].width   >> in_sw;

> > +        s->planes_in[i].height  = s->planes_in[0].height  >> in_sh;

> > +        s->planes_in[i].left    = s->planes_in[0].left    >> in_sw;

> > +        s->planes_in[i].top     = s->planes_in[0].top     >> in_sh;

> > +        s->planes_out[i].width  = s->planes_out[0].width  >> out_sw;

> > +        s->planes_out[i].height = s->planes_out[0].height >> out_sh;

> > +        s->planes_out[i].left   = 0;

> > +        s->planes_out[i].top    = 0;

> > +

> > +    }

> > +

> > +    out_ref = av_hwframe_ctx_alloc(device_ctx);

> > +    if (!out_ref)

> > +        return AVERROR(ENOMEM);

> > +    out_ctx = (AVHWFramesContext*)out_ref->data;

> > +

> > +    out_ctx->format    = AV_PIX_FMT_CUDA;

> > +    out_ctx->sw_format = s->out_fmt;

> > +    out_ctx->width     = FFALIGN(s->planes_out[0].width,  32);

> > +    out_ctx->height    = FFALIGN(s->planes_out[0].height, 32);

> > +

> > +    ret = av_hwframe_ctx_init(out_ref);

> > +    if (ret < 0)

> > +        goto fail;

> > +

> > +    av_frame_unref(s->frame);

> > +    ret = av_hwframe_get_buffer(out_ref, s->frame, 0);

> > +    if (ret < 0)

> > +        goto fail;

> > +

> > +    s->frame->width  = s->planes_out[0].width;

> > +    s->frame->height = s->planes_out[0].height;

> > +

> > +    av_buffer_unref(&s->frames_ctx);

> > +    s->frames_ctx = out_ref;

> > +

> > +    return 0;

> > +fail:

> > +    av_buffer_unref(&out_ref);

> > +    return ret;

> > +}

> > +

> > +static int format_is_supported(enum AVPixelFormat fmt)

> > +{

> > +    int i;

> > +

> > +    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)

> > +        if (supported_formats[i] == fmt)

> > +            return 1;

> > +    return 0;

> > +}

> > +

> > +static av_cold int init_processing_chain(AVFilterContext *ctx, int

> > in_width, int in_height,

> > +                                         int out_width, int

> > out_height,

> > +                                         int left, int top)

> > +{

> > +    CUDACropContext *s = ctx->priv;

> > +

> > +    AVHWFramesContext *in_frames_ctx;

> > +

> > +    enum AVPixelFormat in_format;

> > +    enum AVPixelFormat out_format;

> > +    int ret;

> > +

> > +    /* check that we have a hw context */

> > +    if (!ctx->inputs[0]->hw_frames_ctx) {

> > +        av_log(ctx, AV_LOG_ERROR, "No hw context provided on

> > input\n");

> > +        return AVERROR(EINVAL);

> > +    }

> > +    in_frames_ctx =

> > (AVHWFramesContext*)ctx->inputs[0]->hw_frames_ctx->data;

> > +    in_format     = in_frames_ctx->sw_format;

> > +    out_format    = (s->format == AV_PIX_FMT_NONE) ? in_format :

> > s->format; +

> > +    if (!format_is_supported(in_format)) {

> > +        av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n",

> > +               av_get_pix_fmt_name(in_format));

> > +        return AVERROR(ENOSYS);

> > +    }

> > +    if (!format_is_supported(out_format)) {

> > +        av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n",

> > +               av_get_pix_fmt_name(out_format));

> > +        return AVERROR(ENOSYS);

> > +    }

> > +

> > +    if (in_width == out_width && in_height == out_height)

> > +        s->passthrough = 1;

> > +

> > +    s->in_fmt = in_format;

> > +    s->out_fmt = out_format;

> > +

> > +    s->planes_in[0].width   = in_width;

> > +    s->planes_in[0].height  = in_height;

> > +    s->planes_in[0].left = left;

> > +    s->planes_in[0].top = top;

> > +    s->planes_out[0].width  = out_width;

> > +    s->planes_out[0].height = out_height;

> > +    s->planes_out[0].left = 0;

> > +    s->planes_out[0].top = 0;

> > +

> > +    ret = init_stage(s, in_frames_ctx->device_ref);

> > +    if (ret < 0)

> > +        return ret;

> > +

> > +    ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->frames_ctx);

> > +    if (!ctx->outputs[0]->hw_frames_ctx)

> > +        return AVERROR(ENOMEM);

> > +

> > +    return 0;

> > +}

> > +

> > +static inline int normalize_double(int *n, double d)

> > +{

> > +    int ret = 0;

> > +

> > +    if (isnan(d))

> > +        ret = AVERROR(EINVAL);

> > +    else if (d > INT_MAX || d < INT_MIN) {

> > +        *n = d > INT_MAX ? INT_MAX : INT_MIN;

> > +        ret = AVERROR(EINVAL);

> > +    } else

> > +        *n = lrint(d);

> > +

> > +    return ret;

> > +}

> > +

> > +static av_cold int cudacrop_config_input(AVFilterLink *inlink)

> > +{

> > +    AVFilterContext *ctx = inlink->dst;

> > +    CUDACropContext *s = ctx->priv;

> > +    double res;

> > +    int ret;

> > +

> > +    s->var_values[VAR_IN_W] = s->var_values[VAR_IW] = inlink->w;

> > +    s->var_values[VAR_IN_H] = s->var_values[VAR_IH] = inlink->h;

> > +    s->var_values[VAR_OUT_W] = s->var_values[VAR_OW] = NAN;

> > +    s->var_values[VAR_OUT_H] = s->var_values[VAR_OH] = NAN;

> > +    s->var_values[VAR_X] = NAN;

> > +    s->var_values[VAR_Y] = NAN;

> > +    if ((ret = av_expr_parse_and_eval(&res, s->w_expr,

> > +                                      var_names, s->var_values,

> > +                                      NULL, NULL, NULL, NULL, NULL,

> > 0, ctx)) < 0)

> > +        goto fail;

> > +    s->var_values[VAR_OUT_W] = s->var_values[VAR_OW] = res;

> > +    if ((ret = av_expr_parse_and_eval(&res, s->h_expr,

> > +                                      var_names, s->var_values,

> > +                                      NULL, NULL, NULL, NULL, NULL,

> > 0, ctx)) < 0)

> > +        goto fail;

> > +    s->var_values[VAR_OUT_H] = s->var_values[VAR_OH] = res;

> > +    if ((ret = av_expr_parse_and_eval(&res, s->x_expr,

> > +                                      var_names, s->var_values,

> > +                                      NULL, NULL, NULL, NULL, NULL,

> > 0, ctx)) < 0)

> > +        goto fail;

> > +    s->var_values[VAR_X] = res;

> > +    if ((ret = av_expr_parse_and_eval(&res, s->y_expr,

> > +                                      var_names, s->var_values,

> > +                                      NULL, NULL, NULL, NULL, NULL,

> > 0, ctx)) < 0)

> > +        goto fail;

> > +    s->var_values[VAR_Y] = res;

> > +    if (normalize_double(&s->w, s->var_values[VAR_OW]) < 0 ||

> > +        normalize_double(&s->h, s->var_values[VAR_OH]) < 0 ||

> > +        normalize_double(&s->x, s->var_values[VAR_X]) < 0 ||

> > +        normalize_double(&s->y, s->var_values[VAR_Y]) < 0) {

> > +        av_log(ctx, AV_LOG_ERROR,

> > +               "Too big value or invalid expression for out_w/ow or

> > out_h/oh or x or y");

> > +        return AVERROR(EINVAL);

> > +    }

> > +

> > +fail:

> > +    return ret;

> > +}

> > +

> > +static av_cold int cudacrop_config_output(AVFilterLink *outlink)

> > +{

> > +    AVFilterContext *ctx = outlink->src;

> > +    AVFilterLink *inlink = outlink->src->inputs[0];

> > +    CUDACropContext *s  = ctx->priv;

> > +    AVHWFramesContext     *frames_ctx =

> > (AVHWFramesContext*)inlink->hw_frames_ctx->data;

> > +    AVCUDADeviceContext *device_hwctx =

> > frames_ctx->device_ctx->hwctx;

> > +    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;

> > +    CudaFunctions *cu = device_hwctx->internal->cuda_dl;

> > +    int ret;

> > +

> > +    extern char vf_crop_cuda_ptx[];

> > +

> > +    s->hwctx = device_hwctx;

> > +    s->cu_stream = s->hwctx->stream;

> > +

> > +    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));

> > +    if (ret < 0)

> > +        goto fail;

> > +

> > +    ret = CHECK_CU(cu->cuModuleLoadData(&s->cu_module,

> > vf_crop_cuda_ptx));

> > +    if (ret < 0)

> > +        goto fail;

> > +

> > +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar,

> > s->cu_module, "Crop_uchar"));

> > +    if (ret < 0)

> > +        goto fail;

> > +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar2,

> > s->cu_module, "Crop_uchar2"));

> > +    if (ret < 0)

> > +        goto fail;

> > +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar4,

> > s->cu_module, "Crop_uchar4"));

> > +    if (ret < 0)

> > +        goto fail;

> > +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort,

> > s->cu_module, "Crop_ushort"));

> > +    if (ret < 0)

> > +        goto fail;

> > +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort2,

> > s->cu_module, "Crop_ushort2"));

> > +    if (ret < 0)

> > +        goto fail;

> > +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort4,

> > s->cu_module, "Crop_ushort4"));

> > +    if (ret < 0)

> > +        goto fail;

> > +

> > +    CHECK_CU(cu->cuCtxPopCurrent(&dummy));

> > +

> > +    outlink->w = s->w;

> > +    outlink->h = s->h;

> > +

> > +    ret = init_processing_chain(ctx, inlink->w, inlink->h, s->w,

> > s->h, s->x, s->y);

> > +    if (ret < 0)

> > +        return ret;

> > +

> > +    if (inlink->sample_aspect_ratio.num) {

> > +        outlink->sample_aspect_ratio =

> > av_mul_q((AVRational){outlink->h*inlink->w,

> > +

> > outlink->w*inlink->h},

> > +

> > inlink->sample_aspect_ratio);

> > +    } else {

> > +        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;

> > +    }

> > +

> > +    return 0;

> > +

> > +fail:

> > +    return ret;

> > +}

> > +

> > +static int call_crop_kernel(AVFilterContext *ctx, CUfunction func,

> > int channels,

> > +                              uint8_t *src_dptr, int src_width, int

> > src_height, int src_pitch,

> > +                              uint8_t *dst_dptr, int dst_width, int

> > dst_height, int dst_pitch,

> > +                              int left, int top, int pixel_size)

> > +{

> > +    CUDACropContext *s = ctx->priv;

> > +    CudaFunctions *cu = s->hwctx->internal->cuda_dl;

> > +    CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr;

> > +    CUtexObject tex = 0;

> > +    void *args_uchar[] = { &tex, &dst_devptr, &dst_width,

> > &dst_height, &dst_pitch, &left, &top };

> > +    int ret;

> > +

> > +    CUDA_TEXTURE_DESC tex_desc = {

> > +        .filterMode = CU_TR_FILTER_MODE_LINEAR,

> > +        .flags = CU_TRSF_READ_AS_INTEGER,

> > +    };

> > +

> > +    CUDA_RESOURCE_DESC res_desc = {

> > +        .resType = CU_RESOURCE_TYPE_PITCH2D,

> > +        .res.pitch2D.format = pixel_size == 1 ?

> > +                              CU_AD_FORMAT_UNSIGNED_INT8 :

> > +                              CU_AD_FORMAT_UNSIGNED_INT16,

> > +        .res.pitch2D.numChannels = channels,

> > +        .res.pitch2D.width = src_width,

> > +        .res.pitch2D.height = src_height,

> > +        .res.pitch2D.pitchInBytes = src_pitch,

> > +        .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,

> > +    };

> > +

> > +    ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc,

> > NULL));

> > +    if (ret < 0)

> > +        goto exit;

> > +

> > +    ret = CHECK_CU(cu->cuLaunchKernel(func, DIV_UP(dst_width,

> > BLOCKX), DIV_UP(dst_height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0,

> > s->cu_stream, args_uchar, NULL)); + +exit:

> > +    if (tex)

> > +        CHECK_CU(cu->cuTexObjectDestroy(tex));

> > +    return ret;

> > +}

> > +

> > +static int cropcuda_crop_internal(AVFilterContext *ctx,

> > +                            AVFrame *out, AVFrame *in)

> > +{

> > +    AVHWFramesContext *in_frames_ctx =

> > (AVHWFramesContext*)in->hw_frames_ctx->data;

> > +    CUDACropContext *s = ctx->priv;

> > +

> > +    switch (in_frames_ctx->sw_format) {

> > +    case AV_PIX_FMT_YUV420P:

> > +        call_crop_kernel(ctx, s->cu_func_uchar, 1,

> > +                           in->data[0], in->width, in->height,

> > in->linesize[0],

> > +                           out->data[0], out->width, out->height,

> > out->linesize[0],

> > +                           s->planes_in[0].left,

> > s->planes_in[0].top, 1);

> > +        call_crop_kernel(ctx, s->cu_func_uchar, 1,

> > +                           in->data[0]+in->linesize[0]*in->height,

> > in->width/2, in->height/2, in->linesize[0]/2,

> > +

> > out->data[0]+out->linesize[0]*out->height, out->width/2,

> > out->height/2, out->linesize[0]/2,

> > +                           s->planes_in[1].left,

> > s->planes_in[1].top, 1);

> > +        call_crop_kernel(ctx, s->cu_func_uchar, 1,

> > +                           in->data[0]+

> > ALIGN_UP((in->linesize[0]*in->height*5)/4, s->tex_alignment),

> > in->width/2, in->height/2, in->linesize[0]/2,

> > +

> > out->data[0]+(out->linesize[0]*out->height*5)/4, out->width/2,

> > out->height/2, out->linesize[0]/2,

> > +                           s->planes_in[2].left,

> > s->planes_in[2].top, 1);

> > +        break;

> > +    case AV_PIX_FMT_YUV444P:

> > +        call_crop_kernel(ctx, s->cu_func_uchar, 1,

> > +                           in->data[0], in->width, in->height,

> > in->linesize[0],

> > +                           out->data[0], out->width, out->height,

> > out->linesize[0],

> > +                           s->planes_in[0].left,

> > s->planes_in[0].top, 1);

> > +        call_crop_kernel(ctx, s->cu_func_uchar, 1,

> > +                           in->data[0]+in->linesize[0]*in->height,

> > in->width, in->height, in->linesize[0],

> > +

> > out->data[0]+out->linesize[0]*out->height, out->width, out->height,

> > out->linesize[0],

> > +                           s->planes_in[1].left,

> > s->planes_in[1].top, 1);

> > +        call_crop_kernel(ctx, s->cu_func_uchar, 1,

> > +                           in->data[0]+in->linesize[0]*in->height*2,

> > in->width, in->height, in->linesize[0],

> > +

> > out->data[0]+out->linesize[0]*out->height*2, out->width, out->height,

> > out->linesize[0],

> > +                           s->planes_in[2].left,

> > s->planes_in[2].top, 1);

> > +        break;

> > +    case AV_PIX_FMT_NV12:

> > +        call_crop_kernel(ctx, s->cu_func_uchar, 1,

> > +                           in->data[0], in->width, in->height,

> > in->linesize[0],

> > +                           out->data[0], out->width, out->height,

> > out->linesize[0],

> > +                           s->planes_in[0].left,

> > s->planes_in[0].top, 1);

> > +        call_crop_kernel(ctx, s->cu_func_uchar2, 2,

> > +                           in->data[1], in->width/2, in->height/2,

> > in->linesize[1],

> > +                           out->data[0] + out->linesize[0] *

> > ((out->height + 31) & ~0x1f), out->width/2, out->height/2,

> > out->linesize[1]/2,

> > +                           s->planes_in[1].left,

> > s->planes_in[1].top, 1);

> > +        break;

> > +    case AV_PIX_FMT_P010LE:

> > +        call_crop_kernel(ctx, s->cu_func_ushort, 1,

> > +                           in->data[0], in->width, in->height,

> > in->linesize[0]/2,

> > +                           out->data[0], out->width, out->height,

> > out->linesize[0]/2,

> > +                           s->planes_in[0].left,

> > s->planes_in[0].top, 2);

> > +        call_crop_kernel(ctx, s->cu_func_ushort2, 2,

> > +                           in->data[1], in->width / 2, in->height /

> > 2, in->linesize[1]/2,

> > +                           out->data[0] + out->linesize[0] *

> > ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2,

> > out->linesize[1] / 4,

> > +                           s->planes_in[1].left,

> > s->planes_in[1].top, 2);

> > +        break;

> > +    case AV_PIX_FMT_P016LE:

> > +        call_crop_kernel(ctx, s->cu_func_ushort, 1,

> > +                           in->data[0], in->width, in->height,

> > in->linesize[0] / 2,

> > +                           out->data[0], out->width, out->height,

> > out->linesize[0] / 2,

> > +                           s->planes_in[0].left,

> > s->planes_in[0].top, 2);

> > +        call_crop_kernel(ctx, s->cu_func_ushort2, 2,

> > +                           in->data[1], in->width / 2, in->height /

> > 2, in->linesize[1] / 2,

> > +                           out->data[0] + out->linesize[0] *

> > ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2,

> > out->linesize[1] / 4,

> > +                           s->planes_in[1].left,

> > s->planes_in[1].top, 2);

> > +        break;

> > +    default:

> > +        return AVERROR_BUG;

> > +    }

> > +

> > +    return 0;

> > +}

> > +

> > +static int cudacrop_crop(AVFilterContext *ctx, AVFrame *out, AVFrame

> > *in) +{

> > +    CUDACropContext *s = ctx->priv;

> > +    AVFrame *src = in;

> > +    int ret;

> > +

> > +    ret = cropcuda_crop_internal(ctx, s->frame, src);

> > +    if (ret < 0)

> > +        return ret;

> > +

> > +    src = s->frame;

> > +    ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0);

> > +    if (ret < 0)

> > +        return ret;

> > +

> > +    av_frame_move_ref(out, s->frame);

> > +    av_frame_move_ref(s->frame, s->tmp_frame);

> > +

> > +    ret = av_frame_copy_props(out, in);

> > +    if (ret < 0)

> > +        return ret;

> > +

> > +    return 0;

> > +}

> > +

> > +static int cudacrop_filter_frame(AVFilterLink *link, AVFrame *in)

> > +{

> > +    AVFilterContext              *ctx = link->dst;

> > +    CUDACropContext              *s = ctx->priv;

> > +    AVFilterLink             *outlink = ctx->outputs[0];

> > +    CudaFunctions *cu = s->hwctx->internal->cuda_dl;

> > +

> > +    AVFrame *out = NULL;

> > +    CUcontext dummy;

> > +    int ret = 0;

> > +

> > +    out = av_frame_alloc();

> > +    if (!out) {

> > +        ret = AVERROR(ENOMEM);

> > +        goto fail;

> > +    }

> > +

> > +    ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));

> > +    if (ret < 0)

> > +        goto fail;

> > +

> > +    ret = cudacrop_crop(ctx, out, in);

> > +

> > +    CHECK_CU(cu->cuCtxPopCurrent(&dummy));

> > +    if (ret < 0)

> > +        goto fail;

> > +

> > +    av_reduce(&out->sample_aspect_ratio.num,

> > &out->sample_aspect_ratio.den,

> > +              (int64_t)in->sample_aspect_ratio.num * outlink->h *

> > link->w,

> > +              (int64_t)in->sample_aspect_ratio.den * outlink->w *

> > link->h,

> > +              INT_MAX);

> > +

> > +    av_frame_free(&in);

> > +    return ff_filter_frame(outlink, out);

> > +fail:

> > +    av_frame_free(&in);

> > +    av_frame_free(&out);

> > +    return ret;

> > +}

> > +

> > +#define OFFSET(x) offsetof(CUDACropContext, x)

> > +#define FLAGS

> (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM)

> > +static const AVOption options[] = {

> > +    { "w",      "set the width crop area expression",

> > OFFSET(w_expr),     AV_OPT_TYPE_STRING, { .str = "iw"   }, .flags =

> > FLAGS },

> > +    { "h",      "set the height crop area expression",

> > OFFSET(h_expr),     AV_OPT_TYPE_STRING, { .str = "ih"   }, .flags =

> > FLAGS },

> > +    { "x",      "set the x crop area expression",

> > OFFSET(x_expr),     AV_OPT_TYPE_STRING, { .str =

> > "(in_w-out_w)/2"}, .flags = FLAGS },

> > +    { "y",      "set the y crop area expression",

> > OFFSET(y_expr),     AV_OPT_TYPE_STRING, { .str =

> > "(in_h-out_h)/2"}, .flags = FLAGS },

> > +    { NULL },

> > +};

> > +

> > +static const AVClass cudacrop_class = {

> > +    .class_name = "cudacrop",

> > +    .item_name  = av_default_item_name,

> > +    .option     = options,

> > +    .version    = LIBAVUTIL_VERSION_INT,

> > +};

> > +

> > +static const AVFilterPad cudacrop_inputs[] = {

> > +    {

> > +        .name        = "default",

> > +        .type        = AVMEDIA_TYPE_VIDEO,

> > +        .filter_frame = cudacrop_filter_frame,

> > +        .config_props = cudacrop_config_input,

> > +    },

> > +    { NULL }

> > +};

> > +

> > +static const AVFilterPad cudacrop_outputs[] = {

> > +    {

> > +        .name         = "default",

> > +        .type         = AVMEDIA_TYPE_VIDEO,

> > +        .config_props = cudacrop_config_output,

> > +    },

> > +    { NULL }

> > +};

> > +

> > +AVFilter ff_vf_crop_cuda = {

> > +    .name      = "crop_cuda",

> > +    .description = NULL_IF_CONFIG_SMALL("GPU accelerated video

> > crop"), +

> > +    .init          = cudacrop_init,

> > +    .uninit        = cudacrop_uninit,

> > +    .query_formats = cudacrop_query_formats,

> > +

> > +    .priv_size = sizeof(CUDACropContext),

> > +    .priv_class = &cudacrop_class,

> > +

> > +    .inputs    = cudacrop_inputs,

> > +    .outputs   = cudacrop_outputs,

> > +

> > +    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,

> > +};

> > diff --git a/libavfilter/vf_crop_cuda.cu b/libavfilter/vf_crop_cuda.cu

> > new file mode 100644

> > index 0000000..4b94b73

> > --- /dev/null

> > +++ b/libavfilter/vf_crop_cuda.cu

> > @@ -0,0 +1,109 @@

> > +/*

> > + * Copyright (c) 2019, iQIYI CORPORATION. All rights reserved.

> > + *

> > + * Permission is hereby granted, free of charge, to any person

> > obtaining a

> > + * copy of this software and associated documentation files (the

> > "Software"),

> > + * to deal in the Software without restriction, including without

> > limitation

> > + * the rights to use, copy, modify, merge, publish, distribute,

> > sublicense,

> > + * and/or sell copies of the Software, and to permit persons to whom

> > the

> > + * Software is furnished to do so, subject to the following

> > conditions:

> > + *

> > + * The above copyright notice and this permission notice shall be

> > included in

> > + * all copies or substantial portions of the Software.

> > + *

> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,

> > EXPRESS OR

> > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF

> > MERCHANTABILITY,

> > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO

> > EVENT SHALL

> > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,

> DAMAGES

> > OR OTHER

> > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,

> > ARISING

> > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR

> > OTHER

> > + * DEALINGS IN THE SOFTWARE.

> > + */

> > +

> > +extern "C" {

> > +

> > +__global__ void Crop_uchar(cudaTextureObject_t uchar_tex,

> > +                           unsigned char *dst,

> > +                           int dst_width, int dst_height, int

> > dst_pitch,

> > +                           int left, int top)

> > +{

> > +    int xo = blockIdx.x * blockDim.x + threadIdx.x;

> > +    int yo = blockIdx.y * blockDim.y + threadIdx.y;

> > +    int xi = xo + left;

> > +    int yi = yo + top;

> > +

> > +    if (yo < dst_height && xo < dst_width)

> > +        dst[yo*dst_pitch+xo] = (unsigned char) tex2D<unsigned

> > char>(uchar_tex, xi, yi); +}

> > +

> > +__global__ void Crop_uchar2(cudaTextureObject_t uchar2_tex,

> > +                            uchar2 *dst,

> > +                            int dst_width, int dst_height, int

> > dst_pitch,

> > +                            int left, int top)

> > +{

> > +    int xo = blockIdx.x * blockDim.x + threadIdx.x;

> > +    int yo = blockIdx.y * blockDim.y + threadIdx.y;

> > +    int xi = xo + left;

> > +    int yi = yo + top;

> > +

> > +    if (yo < dst_height && xo < dst_width)

> > +        dst[yo*dst_pitch+xo] = (uchar2) tex2D<uchar2>(uchar2_tex,

> > xi, yi); +}

> > +

> > +__global__ void Crop_uchar4(cudaTextureObject_t uchar4_tex,

> > +                            uchar4 *dst,

> > +                            int dst_width, int dst_height, int

> > dst_pitch,

> > +                            int left, int top)

> > +{

> > +    int xo = blockIdx.x * blockDim.x + threadIdx.x;

> > +    int yo = blockIdx.y * blockDim.y + threadIdx.y;

> > +    int xi = xo + left;

> > +    int yi = yo + top;

> > +

> > +    if (yo < dst_height && xo < dst_width)

> > +        dst[yo*dst_pitch+xo] = (uchar4) tex2D<uchar4>(uchar4_tex,

> > xi, yi); +}

> > +

> > +__global__ void Crop_ushort(cudaTextureObject_t ushort_tex,

> > +                            unsigned short *dst,

> > +                            int dst_width, int dst_height, int

> > dst_pitch,

> > +                            int left, int top)

> > +{

> > +    int xo = blockIdx.x * blockDim.x + threadIdx.x;

> > +    int yo = blockIdx.y * blockDim.y + threadIdx.y;

> > +    int xi = xo + left;

> > +    int yi = yo + top;

> > +

> > +    if (yo < dst_height && xo < dst_width)

> > +        dst[yo*dst_pitch+xo] = (unsigned short) tex2D<unsigned

> > short>(ushort_tex, xi, yi); +}

> > +

> > +__global__ void Crop_ushort2(cudaTextureObject_t ushort2_tex,

> > +                             ushort2 *dst,

> > +                             int dst_width, int dst_height, int

> > dst_pitch,

> > +                             int left, int top)

> > +{

> > +    int xo = blockIdx.x * blockDim.x + threadIdx.x;

> > +    int yo = blockIdx.y * blockDim.y + threadIdx.y;

> > +    int xi = xo + left;

> > +    int yi = yo + top;

> > +

> > +    if (yo < dst_height && xo < dst_width)

> > +        dst[yo*dst_pitch+xo] = (ushort2) tex2D<ushort2>(ushort2_tex,

> > xi, yi); +}

> > +

> > +__global__ void Crop_ushort4(cudaTextureObject_t ushort4_tex,

> > +                             ushort4 *dst,

> > +                             int dst_width, int dst_height, int

> > dst_pitch,

> > +                             int left, int top)

> > +{

> > +    int xo = blockIdx.x * blockDim.x + threadIdx.x;

> > +    int yo = blockIdx.y * blockDim.y + threadIdx.y;

> > +    int xi = xo + left;

> > +    int yi = yo + top;

> > +

> > +    if (yo < dst_height && xo < dst_width)

> > +        dst[yo*dst_pitch+xo] = (ushort4) tex2D<ushort4>(ushort4_tex,

> > xi, yi); +}

> > +

> > +}

> 

> 

> 

> 

> --phil

> _______________________________________________

> ffmpeg-devel mailing list

> ffmpeg-devel@ffmpeg.org

> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

> 

> To unsubscribe, visit link above, or email

> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
leozhang March 25, 2019, 8:27 a.m. UTC | #7
Song, Ruiling <ruiling.song@intel.com> 于2019年3月25日周一 下午3:26写道:
>
>
>
> > -----Original Message-----
> > From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of
> > Philip Langdale via ffmpeg-devel
> > Sent: Monday, March 25, 2019 12:57 PM
> > To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> > Cc: Philip Langdale <philipl@overt.org>
> > Subject: Re: [FFmpeg-devel] [PATCH][FFmpeg-devel v2] Add GPU accelerated
> > video crop filter
> >
> > On Sat, 23 Mar 2019 23:51:10 +0800
> > UsingtcNower <nowerzt@gmail.com> wrote:
> >
> > > Signed-off-by: UsingtcNower <nowerzt@gmail.com>
> > > ---
> > >  Changelog                   |   1 +
> > >  configure                   |   1 +
> > >  doc/filters.texi            |  31 +++
> > >  libavfilter/Makefile        |   1 +
> > >  libavfilter/allfilters.c    |   1 +
> > >  libavfilter/version.h       |   2 +-
> > >  libavfilter/vf_crop_cuda.c  | 638
> > > ++++++++++++++++++++++++++++++++++++++++++++
> > > libavfilter/vf_crop_cuda.cu | 109 ++++++++ 8 files changed, 783
> > > insertions(+), 1 deletion(-) create mode 100644
> > > libavfilter/vf_crop_cuda.c create mode 100644
> > > libavfilter/vf_crop_cuda.cu
> > >
> > > diff --git a/Changelog b/Changelog
> > > index ad7e82f..f224fc8 100644
> > > --- a/Changelog
> > > +++ b/Changelog
> > > @@ -20,6 +20,7 @@ version <next>:
> > >  - libaribb24 based ARIB STD-B24 caption support (profiles A and C)
> > >  - Support decoding of HEVC 4:4:4 content in nvdec and cuviddec
> > >  - removed libndi-newtek
> > > +- crop_cuda GPU accelerated video crop filter
> >
> > Hi,
> >
> > Timo and Mark and I have been discussing this, and we think the right
> > thing to do is add support to vf_scale_cuda to respect the crop
> > properties on an input AVFrame. Mark posted a patch to vf_crop to
> > ensure that the properties are set, and then the scale filter should
> > respect those properties if they are set. You can look at
> > vf_scale_vaapi for how the properties are read, but they will require
> > explicit handling to adjust the src dimensions passed to the scale
> > filter.
Maybe a little not intuitive to users.
> >
> > This will be a more efficient way of handling crops, in terms of total
> > lines of code and also allowing crop/scale with one less copy.
> >
> > I know this is quite different from the approach you've taken here, and
> > we appreciate the work you've done, but it should be better overall to
> > implement this integrated method.
> Hi Philip,
>
> Glad to hear you guys had discussion on this. As I am also considering the problem, I have some questions about your idea.
> So, what if user did not insert a scale_cuda after crop filter? Do you plan to automatically insert scale_cuda or just ignore the crop?
> What if user want to do crop,transpose_cuda,scale_cuda? So we also need to handle crop inside transpose_cuda filter?
I have the same question.
> (looks like we do not have transpose_cuda right now, but this filter seems needed if user want to do transpose job using cuda.)
>
> Thanks!
> Ruiling
> >
> > Thanks,
> >
> > >
> > >  version 4.1:
> > > diff --git a/configure b/configure
> > > index 331393f..3f3ac2f 100755
> > > --- a/configure
> > > +++ b/configure
> > > @@ -2973,6 +2973,7 @@ qsvvpp_select="qsv"
> > >  vaapi_encode_deps="vaapi"
> > >  v4l2_m2m_deps="linux_videodev2_h sem_timedwait"
> > >
> > > +crop_cuda_filter_deps="ffnvcodec cuda_nvcc"
> > >  hwupload_cuda_filter_deps="ffnvcodec"
> > >  scale_npp_filter_deps="ffnvcodec libnpp"
> > >  scale_cuda_filter_deps="ffnvcodec cuda_nvcc"
> > > diff --git a/doc/filters.texi b/doc/filters.texi
> > > index 4ffb392..ee16a2d 100644
> > > --- a/doc/filters.texi
> > > +++ b/doc/filters.texi
> > > @@ -7415,6 +7415,37 @@ If the specified expression is not valid, it
> > > is kept at its current value.
> > >  @end table
> > >
> > > +@section crop_cuda
> > > +
> > > +Crop the input video to given dimensions, implemented in CUDA.
> > > +
> > > +It accepts the following parameters:
> > > +
> > > +@table @option
> > > +
> > > +@item w
> > > +The width of the output video. It defaults to @code{iw}.
> > > +This expression is evaluated only once during the filter
> > > +configuration.
> > > +
> > > +@item h
> > > +The height of the output video. It defaults to @code{ih}.
> > > +This expression is evaluated only once during the filter
> > > +configuration.
> > > +
> > > +@item x
> > > +The horizontal position, in the input video, of the left edge of the
> > > output +video. It defaults to @code{(in_w-out_w)/2}.
> > > +This expression is evaluated only once during the filter
> > > +configuration.
> > > +
> > > +@item y
> > > +The vertical position, in the input video, of the top edge of the
> > > output video. +It defaults to @code{(in_h-out_h)/2}.
> > > +This expression is evaluated only once during the filter
> > > +configuration.
> > > +@end table
> > > +
> > >  @section cropdetect
> > >
> > >  Auto-detect the crop size.
> > > diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> > > index fef6ec5..84df037 100644
> > > --- a/libavfilter/Makefile
> > > +++ b/libavfilter/Makefile
> > > @@ -187,6 +187,7 @@ OBJS-$(CONFIG_COPY_FILTER)                   +=
> > > vf_copy.o OBJS-$(CONFIG_COREIMAGE_FILTER)              +=
> > > vf_coreimage.o OBJS-$(CONFIG_COVER_RECT_FILTER)             +=
> > > vf_cover_rect.o lavfutils.o
> > > OBJS-$(CONFIG_CROP_FILTER)                   += vf_crop.o
> > > +OBJS-$(CONFIG_CROP_CUDA_FILTER)              += vf_crop_cuda.o
> > > vf_crop_cuda.ptx.o OBJS-$(CONFIG_CROPDETECT_FILTER)             +=
> > > vf_cropdetect.o OBJS-$(CONFIG_CUE_FILTER)                    +=
> > > f_cue.o OBJS-$(CONFIG_CURVES_FILTER)                 += vf_curves.o
> > > diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> > > index c51ae0f..550e545 100644 --- a/libavfilter/allfilters.c
> > > +++ b/libavfilter/allfilters.c
> > > @@ -175,6 +175,7 @@ extern AVFilter ff_vf_copy;
> > >  extern AVFilter ff_vf_coreimage;
> > >  extern AVFilter ff_vf_cover_rect;
> > >  extern AVFilter ff_vf_crop;
> > > +extern AVFilter ff_vf_crop_cuda;
> > >  extern AVFilter ff_vf_cropdetect;
> > >  extern AVFilter ff_vf_cue;
> > >  extern AVFilter ff_vf_curves;
> > > diff --git a/libavfilter/version.h b/libavfilter/version.h
> > > index c71282c..5aa95f4 100644
> > > --- a/libavfilter/version.h
> > > +++ b/libavfilter/version.h
> > > @@ -31,7 +31,7 @@
> > >
> > >  #define LIBAVFILTER_VERSION_MAJOR   7
> > >  #define LIBAVFILTER_VERSION_MINOR  48
> > > -#define LIBAVFILTER_VERSION_MICRO 100
> > > +#define LIBAVFILTER_VERSION_MICRO 101
> > >
> > >  #define LIBAVFILTER_VERSION_INT
> > > AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
> > > LIBAVFILTER_VERSION_MINOR, \ diff --git a/libavfilter/vf_crop_cuda.c
> > > b/libavfilter/vf_crop_cuda.c new file mode 100644
> > > index 0000000..fc6a2a6
> > > --- /dev/null
> > > +++ b/libavfilter/vf_crop_cuda.c
> > > @@ -0,0 +1,638 @@
> > > +/*
> > > +* Copyright (c) 2019, iQIYI CORPORATION. All rights reserved.
> > > +*
> > > +* Permission is hereby granted, free of charge, to any person
> > > obtaining a +* copy of this software and associated documentation
> > > files (the "Software"), +* to deal in the Software without
> > > restriction, including without limitation +* the rights to use, copy,
> > > modify, merge, publish, distribute, sublicense, +* and/or sell copies
> > > of the Software, and to permit persons to whom the +* Software is
> > > furnished to do so, subject to the following conditions: +*
> > > +* The above copyright notice and this permission notice shall be
> > > included in +* all copies or substantial portions of the Software.
> > > +*
> > > +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> > > EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
> > OF
> > > MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND
> > > NONINFRINGEMENT.  IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT
> > > HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY,
> > > WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*
> > FROM,
> > > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +*
> > > DEALINGS IN THE SOFTWARE. +*/
> > > +
> > > +#include <stdio.h>
> > > +#include <string.h>
> > > +
> > > +#include "libavutil/avstring.h"
> > > +#include "libavutil/common.h"
> > > +#include "libavutil/hwcontext.h"
> > > +#include "libavutil/hwcontext_cuda_internal.h"
> > > +#include "libavutil/cuda_check.h"
> > > +#include "libavutil/internal.h"
> > > +#include "libavutil/opt.h"
> > > +#include "libavutil/pixdesc.h"
> > > +#include "libavutil/eval.h"
> > > +
> > > +#include "avfilter.h"
> > > +#include "formats.h"
> > > +#include "internal.h"
> > > +#include "video.h"
> > > +
> > > +static const char *const var_names[] = {
> > > +    "in_w", "iw",   ///< width  of the input video
> > > +    "in_h", "ih",   ///< height of the input video
> > > +    "out_w", "ow",  ///< width  of the cropped video
> > > +    "out_h", "oh",  ///< height of the cropped video
> > > +    "x",
> > > +    "y",
> > > +    NULL
> > > +};
> > > +
> > > +enum var_name {
> > > +    VAR_IN_W,  VAR_IW,
> > > +    VAR_IN_H,  VAR_IH,
> > > +    VAR_OUT_W, VAR_OW,
> > > +    VAR_OUT_H, VAR_OH,
> > > +    VAR_X,
> > > +    VAR_Y,
> > > +    VAR_VARS_NB
> > > +};
> > > +
> > > +static const enum AVPixelFormat supported_formats[] = {
> > > +    AV_PIX_FMT_YUV420P,
> > > +    AV_PIX_FMT_NV12,
> > > +    AV_PIX_FMT_YUV444P,
> > > +    AV_PIX_FMT_P010,
> > > +    AV_PIX_FMT_P016
> > > +};
> > > +
> > > +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
> > > +#define ALIGN_UP(a, b) (((a) + (b) - 1) & ~((b) - 1))
> > > +#define NUM_BUFFERS 2
> > > +#define BLOCKX 32
> > > +#define BLOCKY 16
> > > +
> > > +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx,
> > > s->hwctx->internal->cuda_dl, x) +
> > > +typedef struct CUDACropContext {
> > > +    const AVClass *class;
> > > +    AVCUDADeviceContext *hwctx;
> > > +    enum AVPixelFormat in_fmt;
> > > +    enum AVPixelFormat out_fmt;
> > > +
> > > +    struct {
> > > +        int width;
> > > +        int height;
> > > +        int left;
> > > +        int top;
> > > +    } planes_in[3], planes_out[3];
> > > +
> > > +    AVBufferRef *frames_ctx;
> > > +    AVFrame     *frame;
> > > +
> > > +    AVFrame *tmp_frame;
> > > +    int passthrough;
> > > +
> > > +    /**
> > > +     * Output sw format. AV_PIX_FMT_NONE for no conversion.
> > > +     */
> > > +    enum AVPixelFormat format;
> > > +
> > > +    int w,h,x,y;
> > > +    char *w_expr, *h_expr, *x_expr, *y_expr;
> > > +    double var_values[VAR_VARS_NB];
> > > +
> > > +    CUcontext   cu_ctx;
> > > +    CUmodule    cu_module;
> > > +    CUfunction  cu_func_uchar;
> > > +    CUfunction  cu_func_uchar2;
> > > +    CUfunction  cu_func_uchar4;
> > > +    CUfunction  cu_func_ushort;
> > > +    CUfunction  cu_func_ushort2;
> > > +    CUfunction  cu_func_ushort4;
> > > +    CUstream    cu_stream;
> > > +
> > > +    CUdeviceptr srcBuffer;
> > > +    CUdeviceptr dstBuffer;
> > > +    int         tex_alignment;
> > > +} CUDACropContext;
> > > +
> > > +static av_cold int cudacrop_init(AVFilterContext *ctx)
> > > +{
> > > +    CUDACropContext *s = ctx->priv;
> > > +
> > > +    s->format = AV_PIX_FMT_NONE;
> > > +    s->frame = av_frame_alloc();
> > > +    if (!s->frame)
> > > +        return AVERROR(ENOMEM);
> > > +
> > > +    s->tmp_frame = av_frame_alloc();
> > > +    if (!s->tmp_frame)
> > > +        return AVERROR(ENOMEM);
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static av_cold void cudacrop_uninit(AVFilterContext *ctx)
> > > +{
> > > +    CUDACropContext *s = ctx->priv;
> > > +
> > > +    av_frame_free(&s->frame);
> > > +    av_buffer_unref(&s->frames_ctx);
> > > +    av_frame_free(&s->tmp_frame);
> > > +}
> > > +
> > > +static int cudacrop_query_formats(AVFilterContext *ctx)
> > > +{
> > > +    static const enum AVPixelFormat pixel_formats[] = {
> > > +        AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE,
> > > +    };
> > > +    AVFilterFormats *pix_fmts = ff_make_format_list(pixel_formats);
> > > +
> > > +    return ff_set_common_formats(ctx, pix_fmts);
> > > +}
> > > +
> > > +static av_cold int init_stage(CUDACropContext *s, AVBufferRef
> > > *device_ctx) +{
> > > +    AVBufferRef *out_ref = NULL;
> > > +    AVHWFramesContext *out_ctx;
> > > +    int in_sw, in_sh, out_sw, out_sh;
> > > +    int ret, i;
> > > +
> > > +    av_pix_fmt_get_chroma_sub_sample(s->in_fmt,  &in_sw,  &in_sh);
> > > +    av_pix_fmt_get_chroma_sub_sample(s->out_fmt, &out_sw, &out_sh);
> > > +    if (!s->planes_out[0].width) {
> > > +        s->planes_out[0].width  = s->planes_in[0].width;
> > > +        s->planes_out[0].height = s->planes_in[0].height;
> > > +        s->planes_out[0].left = s->planes_in[0].left;
> > > +        s->planes_out[0].top  = s->planes_in[0].top;
> > > +    }
> > > +
> > > +    for (i = 1; i < FF_ARRAY_ELEMS(s->planes_in); i++) {
> > > +        s->planes_in[i].width   = s->planes_in[0].width   >> in_sw;
> > > +        s->planes_in[i].height  = s->planes_in[0].height  >> in_sh;
> > > +        s->planes_in[i].left    = s->planes_in[0].left    >> in_sw;
> > > +        s->planes_in[i].top     = s->planes_in[0].top     >> in_sh;
> > > +        s->planes_out[i].width  = s->planes_out[0].width  >> out_sw;
> > > +        s->planes_out[i].height = s->planes_out[0].height >> out_sh;
> > > +        s->planes_out[i].left   = 0;
> > > +        s->planes_out[i].top    = 0;
> > > +
> > > +    }
> > > +
> > > +    out_ref = av_hwframe_ctx_alloc(device_ctx);
> > > +    if (!out_ref)
> > > +        return AVERROR(ENOMEM);
> > > +    out_ctx = (AVHWFramesContext*)out_ref->data;
> > > +
> > > +    out_ctx->format    = AV_PIX_FMT_CUDA;
> > > +    out_ctx->sw_format = s->out_fmt;
> > > +    out_ctx->width     = FFALIGN(s->planes_out[0].width,  32);
> > > +    out_ctx->height    = FFALIGN(s->planes_out[0].height, 32);
> > > +
> > > +    ret = av_hwframe_ctx_init(out_ref);
> > > +    if (ret < 0)
> > > +        goto fail;
> > > +
> > > +    av_frame_unref(s->frame);
> > > +    ret = av_hwframe_get_buffer(out_ref, s->frame, 0);
> > > +    if (ret < 0)
> > > +        goto fail;
> > > +
> > > +    s->frame->width  = s->planes_out[0].width;
> > > +    s->frame->height = s->planes_out[0].height;
> > > +
> > > +    av_buffer_unref(&s->frames_ctx);
> > > +    s->frames_ctx = out_ref;
> > > +
> > > +    return 0;
> > > +fail:
> > > +    av_buffer_unref(&out_ref);
> > > +    return ret;
> > > +}
> > > +
> > > +static int format_is_supported(enum AVPixelFormat fmt)
> > > +{
> > > +    int i;
> > > +
> > > +    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
> > > +        if (supported_formats[i] == fmt)
> > > +            return 1;
> > > +    return 0;
> > > +}
> > > +
> > > +static av_cold int init_processing_chain(AVFilterContext *ctx, int
> > > in_width, int in_height,
> > > +                                         int out_width, int
> > > out_height,
> > > +                                         int left, int top)
> > > +{
> > > +    CUDACropContext *s = ctx->priv;
> > > +
> > > +    AVHWFramesContext *in_frames_ctx;
> > > +
> > > +    enum AVPixelFormat in_format;
> > > +    enum AVPixelFormat out_format;
> > > +    int ret;
> > > +
> > > +    /* check that we have a hw context */
> > > +    if (!ctx->inputs[0]->hw_frames_ctx) {
> > > +        av_log(ctx, AV_LOG_ERROR, "No hw context provided on
> > > input\n");
> > > +        return AVERROR(EINVAL);
> > > +    }
> > > +    in_frames_ctx =
> > > (AVHWFramesContext*)ctx->inputs[0]->hw_frames_ctx->data;
> > > +    in_format     = in_frames_ctx->sw_format;
> > > +    out_format    = (s->format == AV_PIX_FMT_NONE) ? in_format :
> > > s->format; +
> > > +    if (!format_is_supported(in_format)) {
> > > +        av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n",
> > > +               av_get_pix_fmt_name(in_format));
> > > +        return AVERROR(ENOSYS);
> > > +    }
> > > +    if (!format_is_supported(out_format)) {
> > > +        av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n",
> > > +               av_get_pix_fmt_name(out_format));
> > > +        return AVERROR(ENOSYS);
> > > +    }
> > > +
> > > +    if (in_width == out_width && in_height == out_height)
> > > +        s->passthrough = 1;
> > > +
> > > +    s->in_fmt = in_format;
> > > +    s->out_fmt = out_format;
> > > +
> > > +    s->planes_in[0].width   = in_width;
> > > +    s->planes_in[0].height  = in_height;
> > > +    s->planes_in[0].left = left;
> > > +    s->planes_in[0].top = top;
> > > +    s->planes_out[0].width  = out_width;
> > > +    s->planes_out[0].height = out_height;
> > > +    s->planes_out[0].left = 0;
> > > +    s->planes_out[0].top = 0;
> > > +
> > > +    ret = init_stage(s, in_frames_ctx->device_ref);
> > > +    if (ret < 0)
> > > +        return ret;
> > > +
> > > +    ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->frames_ctx);
> > > +    if (!ctx->outputs[0]->hw_frames_ctx)
> > > +        return AVERROR(ENOMEM);
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static inline int normalize_double(int *n, double d)
> > > +{
> > > +    int ret = 0;
> > > +
> > > +    if (isnan(d))
> > > +        ret = AVERROR(EINVAL);
> > > +    else if (d > INT_MAX || d < INT_MIN) {
> > > +        *n = d > INT_MAX ? INT_MAX : INT_MIN;
> > > +        ret = AVERROR(EINVAL);
> > > +    } else
> > > +        *n = lrint(d);
> > > +
> > > +    return ret;
> > > +}
> > > +
> > > +static av_cold int cudacrop_config_input(AVFilterLink *inlink)
> > > +{
> > > +    AVFilterContext *ctx = inlink->dst;
> > > +    CUDACropContext *s = ctx->priv;
> > > +    double res;
> > > +    int ret;
> > > +
> > > +    s->var_values[VAR_IN_W] = s->var_values[VAR_IW] = inlink->w;
> > > +    s->var_values[VAR_IN_H] = s->var_values[VAR_IH] = inlink->h;
> > > +    s->var_values[VAR_OUT_W] = s->var_values[VAR_OW] = NAN;
> > > +    s->var_values[VAR_OUT_H] = s->var_values[VAR_OH] = NAN;
> > > +    s->var_values[VAR_X] = NAN;
> > > +    s->var_values[VAR_Y] = NAN;
> > > +    if ((ret = av_expr_parse_and_eval(&res, s->w_expr,
> > > +                                      var_names, s->var_values,
> > > +                                      NULL, NULL, NULL, NULL, NULL,
> > > 0, ctx)) < 0)
> > > +        goto fail;
> > > +    s->var_values[VAR_OUT_W] = s->var_values[VAR_OW] = res;
> > > +    if ((ret = av_expr_parse_and_eval(&res, s->h_expr,
> > > +                                      var_names, s->var_values,
> > > +                                      NULL, NULL, NULL, NULL, NULL,
> > > 0, ctx)) < 0)
> > > +        goto fail;
> > > +    s->var_values[VAR_OUT_H] = s->var_values[VAR_OH] = res;
> > > +    if ((ret = av_expr_parse_and_eval(&res, s->x_expr,
> > > +                                      var_names, s->var_values,
> > > +                                      NULL, NULL, NULL, NULL, NULL,
> > > 0, ctx)) < 0)
> > > +        goto fail;
> > > +    s->var_values[VAR_X] = res;
> > > +    if ((ret = av_expr_parse_and_eval(&res, s->y_expr,
> > > +                                      var_names, s->var_values,
> > > +                                      NULL, NULL, NULL, NULL, NULL,
> > > 0, ctx)) < 0)
> > > +        goto fail;
> > > +    s->var_values[VAR_Y] = res;
> > > +    if (normalize_double(&s->w, s->var_values[VAR_OW]) < 0 ||
> > > +        normalize_double(&s->h, s->var_values[VAR_OH]) < 0 ||
> > > +        normalize_double(&s->x, s->var_values[VAR_X]) < 0 ||
> > > +        normalize_double(&s->y, s->var_values[VAR_Y]) < 0) {
> > > +        av_log(ctx, AV_LOG_ERROR,
> > > +               "Too big value or invalid expression for out_w/ow or
> > > out_h/oh or x or y");
> > > +        return AVERROR(EINVAL);
> > > +    }
> > > +
> > > +fail:
> > > +    return ret;
> > > +}
> > > +
> > > +static av_cold int cudacrop_config_output(AVFilterLink *outlink)
> > > +{
> > > +    AVFilterContext *ctx = outlink->src;
> > > +    AVFilterLink *inlink = outlink->src->inputs[0];
> > > +    CUDACropContext *s  = ctx->priv;
> > > +    AVHWFramesContext     *frames_ctx =
> > > (AVHWFramesContext*)inlink->hw_frames_ctx->data;
> > > +    AVCUDADeviceContext *device_hwctx =
> > > frames_ctx->device_ctx->hwctx;
> > > +    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
> > > +    CudaFunctions *cu = device_hwctx->internal->cuda_dl;
> > > +    int ret;
> > > +
> > > +    extern char vf_crop_cuda_ptx[];
> > > +
> > > +    s->hwctx = device_hwctx;
> > > +    s->cu_stream = s->hwctx->stream;
> > > +
> > > +    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
> > > +    if (ret < 0)
> > > +        goto fail;
> > > +
> > > +    ret = CHECK_CU(cu->cuModuleLoadData(&s->cu_module,
> > > vf_crop_cuda_ptx));
> > > +    if (ret < 0)
> > > +        goto fail;
> > > +
> > > +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar,
> > > s->cu_module, "Crop_uchar"));
> > > +    if (ret < 0)
> > > +        goto fail;
> > > +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar2,
> > > s->cu_module, "Crop_uchar2"));
> > > +    if (ret < 0)
> > > +        goto fail;
> > > +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar4,
> > > s->cu_module, "Crop_uchar4"));
> > > +    if (ret < 0)
> > > +        goto fail;
> > > +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort,
> > > s->cu_module, "Crop_ushort"));
> > > +    if (ret < 0)
> > > +        goto fail;
> > > +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort2,
> > > s->cu_module, "Crop_ushort2"));
> > > +    if (ret < 0)
> > > +        goto fail;
> > > +    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort4,
> > > s->cu_module, "Crop_ushort4"));
> > > +    if (ret < 0)
> > > +        goto fail;
> > > +
> > > +    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
> > > +
> > > +    outlink->w = s->w;
> > > +    outlink->h = s->h;
> > > +
> > > +    ret = init_processing_chain(ctx, inlink->w, inlink->h, s->w,
> > > s->h, s->x, s->y);
> > > +    if (ret < 0)
> > > +        return ret;
> > > +
> > > +    if (inlink->sample_aspect_ratio.num) {
> > > +        outlink->sample_aspect_ratio =
> > > av_mul_q((AVRational){outlink->h*inlink->w,
> > > +
> > > outlink->w*inlink->h},
> > > +
> > > inlink->sample_aspect_ratio);
> > > +    } else {
> > > +        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
> > > +    }
> > > +
> > > +    return 0;
> > > +
> > > +fail:
> > > +    return ret;
> > > +}
> > > +
> > > +static int call_crop_kernel(AVFilterContext *ctx, CUfunction func,
> > > int channels,
> > > +                              uint8_t *src_dptr, int src_width, int
> > > src_height, int src_pitch,
> > > +                              uint8_t *dst_dptr, int dst_width, int
> > > dst_height, int dst_pitch,
> > > +                              int left, int top, int pixel_size)
> > > +{
> > > +    CUDACropContext *s = ctx->priv;
> > > +    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
> > > +    CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr;
> > > +    CUtexObject tex = 0;
> > > +    void *args_uchar[] = { &tex, &dst_devptr, &dst_width,
> > > &dst_height, &dst_pitch, &left, &top };
> > > +    int ret;
> > > +
> > > +    CUDA_TEXTURE_DESC tex_desc = {
> > > +        .filterMode = CU_TR_FILTER_MODE_LINEAR,
> > > +        .flags = CU_TRSF_READ_AS_INTEGER,
> > > +    };
> > > +
> > > +    CUDA_RESOURCE_DESC res_desc = {
> > > +        .resType = CU_RESOURCE_TYPE_PITCH2D,
> > > +        .res.pitch2D.format = pixel_size == 1 ?
> > > +                              CU_AD_FORMAT_UNSIGNED_INT8 :
> > > +                              CU_AD_FORMAT_UNSIGNED_INT16,
> > > +        .res.pitch2D.numChannels = channels,
> > > +        .res.pitch2D.width = src_width,
> > > +        .res.pitch2D.height = src_height,
> > > +        .res.pitch2D.pitchInBytes = src_pitch,
> > > +        .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
> > > +    };
> > > +
> > > +    ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc,
> > > NULL));
> > > +    if (ret < 0)
> > > +        goto exit;
> > > +
> > > +    ret = CHECK_CU(cu->cuLaunchKernel(func, DIV_UP(dst_width,
> > > BLOCKX), DIV_UP(dst_height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0,
> > > s->cu_stream, args_uchar, NULL)); + +exit:
> > > +    if (tex)
> > > +        CHECK_CU(cu->cuTexObjectDestroy(tex));
> > > +    return ret;
> > > +}
> > > +
> > > +static int cropcuda_crop_internal(AVFilterContext *ctx,
> > > +                            AVFrame *out, AVFrame *in)
> > > +{
> > > +    AVHWFramesContext *in_frames_ctx =
> > > (AVHWFramesContext*)in->hw_frames_ctx->data;
> > > +    CUDACropContext *s = ctx->priv;
> > > +
> > > +    switch (in_frames_ctx->sw_format) {
> > > +    case AV_PIX_FMT_YUV420P:
> > > +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> > > +                           in->data[0], in->width, in->height,
> > > in->linesize[0],
> > > +                           out->data[0], out->width, out->height,
> > > out->linesize[0],
> > > +                           s->planes_in[0].left,
> > > s->planes_in[0].top, 1);
> > > +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> > > +                           in->data[0]+in->linesize[0]*in->height,
> > > in->width/2, in->height/2, in->linesize[0]/2,
> > > +
> > > out->data[0]+out->linesize[0]*out->height, out->width/2,
> > > out->height/2, out->linesize[0]/2,
> > > +                           s->planes_in[1].left,
> > > s->planes_in[1].top, 1);
> > > +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> > > +                           in->data[0]+
> > > ALIGN_UP((in->linesize[0]*in->height*5)/4, s->tex_alignment),
> > > in->width/2, in->height/2, in->linesize[0]/2,
> > > +
> > > out->data[0]+(out->linesize[0]*out->height*5)/4, out->width/2,
> > > out->height/2, out->linesize[0]/2,
> > > +                           s->planes_in[2].left,
> > > s->planes_in[2].top, 1);
> > > +        break;
> > > +    case AV_PIX_FMT_YUV444P:
> > > +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> > > +                           in->data[0], in->width, in->height,
> > > in->linesize[0],
> > > +                           out->data[0], out->width, out->height,
> > > out->linesize[0],
> > > +                           s->planes_in[0].left,
> > > s->planes_in[0].top, 1);
> > > +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> > > +                           in->data[0]+in->linesize[0]*in->height,
> > > in->width, in->height, in->linesize[0],
> > > +
> > > out->data[0]+out->linesize[0]*out->height, out->width, out->height,
> > > out->linesize[0],
> > > +                           s->planes_in[1].left,
> > > s->planes_in[1].top, 1);
> > > +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> > > +                           in->data[0]+in->linesize[0]*in->height*2,
> > > in->width, in->height, in->linesize[0],
> > > +
> > > out->data[0]+out->linesize[0]*out->height*2, out->width, out->height,
> > > out->linesize[0],
> > > +                           s->planes_in[2].left,
> > > s->planes_in[2].top, 1);
> > > +        break;
> > > +    case AV_PIX_FMT_NV12:
> > > +        call_crop_kernel(ctx, s->cu_func_uchar, 1,
> > > +                           in->data[0], in->width, in->height,
> > > in->linesize[0],
> > > +                           out->data[0], out->width, out->height,
> > > out->linesize[0],
> > > +                           s->planes_in[0].left,
> > > s->planes_in[0].top, 1);
> > > +        call_crop_kernel(ctx, s->cu_func_uchar2, 2,
> > > +                           in->data[1], in->width/2, in->height/2,
> > > in->linesize[1],
> > > +                           out->data[0] + out->linesize[0] *
> > > ((out->height + 31) & ~0x1f), out->width/2, out->height/2,
> > > out->linesize[1]/2,
> > > +                           s->planes_in[1].left,
> > > s->planes_in[1].top, 1);
> > > +        break;
> > > +    case AV_PIX_FMT_P010LE:
> > > +        call_crop_kernel(ctx, s->cu_func_ushort, 1,
> > > +                           in->data[0], in->width, in->height,
> > > in->linesize[0]/2,
> > > +                           out->data[0], out->width, out->height,
> > > out->linesize[0]/2,
> > > +                           s->planes_in[0].left,
> > > s->planes_in[0].top, 2);
> > > +        call_crop_kernel(ctx, s->cu_func_ushort2, 2,
> > > +                           in->data[1], in->width / 2, in->height /
> > > 2, in->linesize[1]/2,
> > > +                           out->data[0] + out->linesize[0] *
> > > ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2,
> > > out->linesize[1] / 4,
> > > +                           s->planes_in[1].left,
> > > s->planes_in[1].top, 2);
> > > +        break;
> > > +    case AV_PIX_FMT_P016LE:
> > > +        call_crop_kernel(ctx, s->cu_func_ushort, 1,
> > > +                           in->data[0], in->width, in->height,
> > > in->linesize[0] / 2,
> > > +                           out->data[0], out->width, out->height,
> > > out->linesize[0] / 2,
> > > +                           s->planes_in[0].left,
> > > s->planes_in[0].top, 2);
> > > +        call_crop_kernel(ctx, s->cu_func_ushort2, 2,
> > > +                           in->data[1], in->width / 2, in->height /
> > > 2, in->linesize[1] / 2,
> > > +                           out->data[0] + out->linesize[0] *
> > > ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2,
> > > out->linesize[1] / 4,
> > > +                           s->planes_in[1].left,
> > > s->planes_in[1].top, 2);
> > > +        break;
> > > +    default:
> > > +        return AVERROR_BUG;
> > > +    }
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static int cudacrop_crop(AVFilterContext *ctx, AVFrame *out, AVFrame
> > > *in) +{
> > > +    CUDACropContext *s = ctx->priv;
> > > +    AVFrame *src = in;
> > > +    int ret;
> > > +
> > > +    ret = cropcuda_crop_internal(ctx, s->frame, src);
> > > +    if (ret < 0)
> > > +        return ret;
> > > +
> > > +    src = s->frame;
> > > +    ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0);
> > > +    if (ret < 0)
> > > +        return ret;
> > > +
> > > +    av_frame_move_ref(out, s->frame);
> > > +    av_frame_move_ref(s->frame, s->tmp_frame);
> > > +
> > > +    ret = av_frame_copy_props(out, in);
> > > +    if (ret < 0)
> > > +        return ret;
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static int cudacrop_filter_frame(AVFilterLink *link, AVFrame *in)
> > > +{
> > > +    AVFilterContext              *ctx = link->dst;
> > > +    CUDACropContext              *s = ctx->priv;
> > > +    AVFilterLink             *outlink = ctx->outputs[0];
> > > +    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
> > > +
> > > +    AVFrame *out = NULL;
> > > +    CUcontext dummy;
> > > +    int ret = 0;
> > > +
> > > +    out = av_frame_alloc();
> > > +    if (!out) {
> > > +        ret = AVERROR(ENOMEM);
> > > +        goto fail;
> > > +    }
> > > +
> > > +    ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
> > > +    if (ret < 0)
> > > +        goto fail;
> > > +
> > > +    ret = cudacrop_crop(ctx, out, in);
> > > +
> > > +    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
> > > +    if (ret < 0)
> > > +        goto fail;
> > > +
> > > +    av_reduce(&out->sample_aspect_ratio.num,
> > > &out->sample_aspect_ratio.den,
> > > +              (int64_t)in->sample_aspect_ratio.num * outlink->h *
> > > link->w,
> > > +              (int64_t)in->sample_aspect_ratio.den * outlink->w *
> > > link->h,
> > > +              INT_MAX);
> > > +
> > > +    av_frame_free(&in);
> > > +    return ff_filter_frame(outlink, out);
> > > +fail:
> > > +    av_frame_free(&in);
> > > +    av_frame_free(&out);
> > > +    return ret;
> > > +}
> > > +
> > > +#define OFFSET(x) offsetof(CUDACropContext, x)
> > > +#define FLAGS
> > (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM)
> > > +static const AVOption options[] = {
> > > +    { "w",      "set the width crop area expression",
> > > OFFSET(w_expr),     AV_OPT_TYPE_STRING, { .str = "iw"   }, .flags =
> > > FLAGS },
> > > +    { "h",      "set the height crop area expression",
> > > OFFSET(h_expr),     AV_OPT_TYPE_STRING, { .str = "ih"   }, .flags =
> > > FLAGS },
> > > +    { "x",      "set the x crop area expression",
> > > OFFSET(x_expr),     AV_OPT_TYPE_STRING, { .str =
> > > "(in_w-out_w)/2"}, .flags = FLAGS },
> > > +    { "y",      "set the y crop area expression",
> > > OFFSET(y_expr),     AV_OPT_TYPE_STRING, { .str =
> > > "(in_h-out_h)/2"}, .flags = FLAGS },
> > > +    { NULL },
> > > +};
> > > +
> > > +static const AVClass cudacrop_class = {
> > > +    .class_name = "cudacrop",
> > > +    .item_name  = av_default_item_name,
> > > +    .option     = options,
> > > +    .version    = LIBAVUTIL_VERSION_INT,
> > > +};
> > > +
> > > +static const AVFilterPad cudacrop_inputs[] = {
> > > +    {
> > > +        .name        = "default",
> > > +        .type        = AVMEDIA_TYPE_VIDEO,
> > > +        .filter_frame = cudacrop_filter_frame,
> > > +        .config_props = cudacrop_config_input,
> > > +    },
> > > +    { NULL }
> > > +};
> > > +
> > > +static const AVFilterPad cudacrop_outputs[] = {
> > > +    {
> > > +        .name         = "default",
> > > +        .type         = AVMEDIA_TYPE_VIDEO,
> > > +        .config_props = cudacrop_config_output,
> > > +    },
> > > +    { NULL }
> > > +};
> > > +
> > > +AVFilter ff_vf_crop_cuda = {
> > > +    .name      = "crop_cuda",
> > > +    .description = NULL_IF_CONFIG_SMALL("GPU accelerated video
> > > crop"), +
> > > +    .init          = cudacrop_init,
> > > +    .uninit        = cudacrop_uninit,
> > > +    .query_formats = cudacrop_query_formats,
> > > +
> > > +    .priv_size = sizeof(CUDACropContext),
> > > +    .priv_class = &cudacrop_class,
> > > +
> > > +    .inputs    = cudacrop_inputs,
> > > +    .outputs   = cudacrop_outputs,
> > > +
> > > +    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
> > > +};
> > > diff --git a/libavfilter/vf_crop_cuda.cu b/libavfilter/vf_crop_cuda.cu
> > > new file mode 100644
> > > index 0000000..4b94b73
> > > --- /dev/null
> > > +++ b/libavfilter/vf_crop_cuda.cu
> > > @@ -0,0 +1,109 @@
> > > +/*
> > > + * Copyright (c) 2019, iQIYI CORPORATION. All rights reserved.
> > > + *
> > > + * Permission is hereby granted, free of charge, to any person
> > > obtaining a
> > > + * copy of this software and associated documentation files (the
> > > "Software"),
> > > + * to deal in the Software without restriction, including without
> > > limitation
> > > + * the rights to use, copy, modify, merge, publish, distribute,
> > > sublicense,
> > > + * and/or sell copies of the Software, and to permit persons to whom
> > > the
> > > + * Software is furnished to do so, subject to the following
> > > conditions:
> > > + *
> > > + * The above copyright notice and this permission notice shall be
> > > included in
> > > + * all copies or substantial portions of the Software.
> > > + *
> > > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> > > EXPRESS OR
> > > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> > > MERCHANTABILITY,
> > > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
> > > EVENT SHALL
> > > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
> > DAMAGES
> > > OR OTHER
> > > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> > > ARISING
> > > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> > > OTHER
> > > + * DEALINGS IN THE SOFTWARE.
> > > + */
> > > +
> > > +extern "C" {
> > > +
> > > +__global__ void Crop_uchar(cudaTextureObject_t uchar_tex,
> > > +                           unsigned char *dst,
> > > +                           int dst_width, int dst_height, int
> > > dst_pitch,
> > > +                           int left, int top)
> > > +{
> > > +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> > > +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> > > +    int xi = xo + left;
> > > +    int yi = yo + top;
> > > +
> > > +    if (yo < dst_height && xo < dst_width)
> > > +        dst[yo*dst_pitch+xo] = (unsigned char) tex2D<unsigned
> > > char>(uchar_tex, xi, yi); +}
> > > +
> > > +__global__ void Crop_uchar2(cudaTextureObject_t uchar2_tex,
> > > +                            uchar2 *dst,
> > > +                            int dst_width, int dst_height, int
> > > dst_pitch,
> > > +                            int left, int top)
> > > +{
> > > +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> > > +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> > > +    int xi = xo + left;
> > > +    int yi = yo + top;
> > > +
> > > +    if (yo < dst_height && xo < dst_width)
> > > +        dst[yo*dst_pitch+xo] = (uchar2) tex2D<uchar2>(uchar2_tex,
> > > xi, yi); +}
> > > +
> > > +__global__ void Crop_uchar4(cudaTextureObject_t uchar4_tex,
> > > +                            uchar4 *dst,
> > > +                            int dst_width, int dst_height, int
> > > dst_pitch,
> > > +                            int left, int top)
> > > +{
> > > +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> > > +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> > > +    int xi = xo + left;
> > > +    int yi = yo + top;
> > > +
> > > +    if (yo < dst_height && xo < dst_width)
> > > +        dst[yo*dst_pitch+xo] = (uchar4) tex2D<uchar4>(uchar4_tex,
> > > xi, yi); +}
> > > +
> > > +__global__ void Crop_ushort(cudaTextureObject_t ushort_tex,
> > > +                            unsigned short *dst,
> > > +                            int dst_width, int dst_height, int
> > > dst_pitch,
> > > +                            int left, int top)
> > > +{
> > > +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> > > +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> > > +    int xi = xo + left;
> > > +    int yi = yo + top;
> > > +
> > > +    if (yo < dst_height && xo < dst_width)
> > > +        dst[yo*dst_pitch+xo] = (unsigned short) tex2D<unsigned
> > > short>(ushort_tex, xi, yi); +}
> > > +
> > > +__global__ void Crop_ushort2(cudaTextureObject_t ushort2_tex,
> > > +                             ushort2 *dst,
> > > +                             int dst_width, int dst_height, int
> > > dst_pitch,
> > > +                             int left, int top)
> > > +{
> > > +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> > > +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> > > +    int xi = xo + left;
> > > +    int yi = yo + top;
> > > +
> > > +    if (yo < dst_height && xo < dst_width)
> > > +        dst[yo*dst_pitch+xo] = (ushort2) tex2D<ushort2>(ushort2_tex,
> > > xi, yi); +}
> > > +
> > > +__global__ void Crop_ushort4(cudaTextureObject_t ushort4_tex,
> > > +                             ushort4 *dst,
> > > +                             int dst_width, int dst_height, int
> > > dst_pitch,
> > > +                             int left, int top)
> > > +{
> > > +    int xo = blockIdx.x * blockDim.x + threadIdx.x;
> > > +    int yo = blockIdx.y * blockDim.y + threadIdx.y;
> > > +    int xi = xo + left;
> > > +    int yi = yo + top;
> > > +
> > > +    if (yo < dst_height && xo < dst_width)
> > > +        dst[yo*dst_pitch+xo] = (ushort4) tex2D<ushort4>(ushort4_tex,
> > > xi, yi); +}
> > > +
> > > +}
> >
> >
> >
> >
> > --phil
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> > To unsubscribe, visit link above, or email
> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Timo Rothenpieler March 25, 2019, 10:31 a.m. UTC | #8
On 25/03/2019 09:27, Tao Zhang wrote:
>>> Hi,
>>>
>>> Timo and Mark and I have been discussing this, and we think the right
>>> thing to do is add support to vf_scale_cuda to respect the crop
>>> properties on an input AVFrame. Mark posted a patch to vf_crop to
>>> ensure that the properties are set, and then the scale filter should
>>> respect those properties if they are set. You can look at
>>> vf_scale_vaapi for how the properties are read, but they will require
>>> explicit handling to adjust the src dimensions passed to the scale
>>> filter.
> Maybe a little not intuitive to users.
>>>
>>> This will be a more efficient way of handling crops, in terms of total
>>> lines of code and also allowing crop/scale with one less copy.
>>>
>>> I know this is quite different from the approach you've taken here, and
>>> we appreciate the work you've done, but it should be better overall to
>>> implement this integrated method.
>> Hi Philip,
>>
>> Glad to hear you guys had discussion on this. As I am also considering the problem, I have some questions about your idea.
>> So, what if user did not insert a scale_cuda after crop filter? Do you plan to automatically insert scale_cuda or just ignore the crop?
>> What if user want to do crop,transpose_cuda,scale_cuda? So we also need to handle crop inside transpose_cuda filter?
 >
> I have the same question.
Ideally, scale_cuda should be auto-inserted at the required places once 
it works that way.
Otherwise it seems pointless to me if the user still has to manually 
insert it after the generic filters setting metadata.

For that reason it should also still support getting its parameters 
passed directly as a fallback, and potentially even expose multiple 
filter names, so crop_cuda and transpose_cuda are still visible, but 
ultimately point to the same filter code.

We have a transpose_npp, right now, but with libnpp slowly being on its 
way out, transpose_cuda is needed, and ultimately even a format_cuda 
filter, since right now scale_npp is the only filter that can convert 
pixel formats on the hardware.
I'd also like to see scale_cuda to support a few more interpolation 
algorithms, but that's not very important for now.

All this functionality can be in the same filter, which is scale_cuda. 
The point of that is that it avoids needless expensive frame copies as 
much as possible.
leozhang March 26, 2019, 3:53 a.m. UTC | #9
Timo Rothenpieler <timo@rothenpieler.org> 于2019年3月25日周一 下午6:31写道:
>
> On 25/03/2019 09:27, Tao Zhang wrote:
> >>> Hi,
> >>>
> >>> Timo and Mark and I have been discussing this, and we think the right
> >>> thing to do is add support to vf_scale_cuda to respect the crop
> >>> properties on an input AVFrame. Mark posted a patch to vf_crop to
> >>> ensure that the properties are set, and then the scale filter should
> >>> respect those properties if they are set. You can look at
> >>> vf_scale_vaapi for how the properties are read, but they will require
> >>> explicit handling to adjust the src dimensions passed to the scale
> >>> filter.
> > Maybe a little not intuitive to users.
> >>>
> >>> This will be a more efficient way of handling crops, in terms of total
> >>> lines of code and also allowing crop/scale with one less copy.
> >>>
> >>> I know this is quite different from the approach you've taken here, and
> >>> we appreciate the work you've done, but it should be better overall to
> >>> implement this integrated method.
> >> Hi Philip,
> >>
> >> Glad to hear you guys had discussion on this. As I am also considering the problem, I have some questions about your idea.
> >> So, what if user did not insert a scale_cuda after crop filter? Do you plan to automatically insert scale_cuda or just ignore the crop?
> >> What if user want to do crop,transpose_cuda,scale_cuda? So we also need to handle crop inside transpose_cuda filter?
>  >
> > I have the same question.
> Ideally, scale_cuda should be auto-inserted at the required places once
> it works that way.
> Otherwise it seems pointless to me if the user still has to manually
> insert it after the generic filters setting metadata.
>
> For that reason it should also still support getting its parameters
> passed directly as a fallback, and potentially even expose multiple
> filter names, so crop_cuda and transpose_cuda are still visible, but
> ultimately point to the same filter code.
>
> We have a transpose_npp, right now, but with libnpp slowly being on its
> way out, transpose_cuda is needed, and ultimately even a format_cuda
> filter, since right now scale_npp is the only filter that can convert
> pixel formats on the hardware.
> I'd also like to see scale_cuda to support a few more interpolation
> algorithms, but that's not very important for now.
>
> All this functionality can be in the same filter, which is scale_cuda.
> The point of that is that it avoids needless expensive frame copies as
> much as possible.
I think frame copy is very low cost on cuda. Maybe the better way is
to create common functions for cuda related repeat code, but keep
frame detailed filtering seperate.
For example, if someone want to do bilateral filter with cuda, just
need focus on the filtering itself. True, scale_cuda can crop, however
hard to do bilateral filtering currently.
Thank you all for the discussion. If I have wrong, feel free to point it out.
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Ruiling Song March 26, 2019, 4:19 a.m. UTC | #10
> -----Original Message-----

> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of

> Timo Rothenpieler

> Sent: Monday, March 25, 2019 6:31 PM

> To: ffmpeg-devel@ffmpeg.org

> Subject: Re: [FFmpeg-devel] [PATCH][FFmpeg-devel v2] Add GPU accelerated

> video crop filter

> 

> On 25/03/2019 09:27, Tao Zhang wrote:

> >>> Hi,

> >>>

> >>> Timo and Mark and I have been discussing this, and we think the right

> >>> thing to do is add support to vf_scale_cuda to respect the crop

> >>> properties on an input AVFrame. Mark posted a patch to vf_crop to

> >>> ensure that the properties are set, and then the scale filter should

> >>> respect those properties if they are set. You can look at

> >>> vf_scale_vaapi for how the properties are read, but they will require

> >>> explicit handling to adjust the src dimensions passed to the scale

> >>> filter.

> > Maybe a little not intuitive to users.

> >>>

> >>> This will be a more efficient way of handling crops, in terms of total

> >>> lines of code and also allowing crop/scale with one less copy.

> >>>

> >>> I know this is quite different from the approach you've taken here, and

> >>> we appreciate the work you've done, but it should be better overall to

> >>> implement this integrated method.

> >> Hi Philip,

> >>

> >> Glad to hear you guys had discussion on this. As I am also considering the

> problem, I have some questions about your idea.

> >> So, what if user did not insert a scale_cuda after crop filter? Do you plan to

> automatically insert scale_cuda or just ignore the crop?

> >> What if user want to do crop,transpose_cuda,scale_cuda? So we also need

> to handle crop inside transpose_cuda filter?

>  >

> > I have the same question.

> Ideally, scale_cuda should be auto-inserted at the required places once

> it works that way.

> Otherwise it seems pointless to me if the user still has to manually

> insert it after the generic filters setting metadata.

Agree.

> 

> For that reason it should also still support getting its parameters

> passed directly as a fallback, and potentially even expose multiple

> filter names, so crop_cuda and transpose_cuda are still visible, but

> ultimately point to the same filter code.

> 

> We have a transpose_npp, right now, but with libnpp slowly being on its

> way out, transpose_cuda is needed, and ultimately even a format_cuda

> filter, since right now scale_npp is the only filter that can convert

> pixel formats on the hardware.

> I'd also like to see scale_cuda to support a few more interpolation

> algorithms, but that's not very important for now.

> 

> All this functionality can be in the same filter, which is scale_cuda.

> The point of that is that it avoids needless expensive frame copies as

> much as possible.


For crop/transpose, these are just some copy-like kernel. May be a good idea to merge with other kernels.
But I am not sure how much overall performance gain we would get for a transcoding pipeline. And merging all the things together may make code very complex.
For example, a crop+scale or crop+transpose may be easy to merge. But a crop+transpose+scale or crop+transpose+scale+format will be more complex.

I want to share some of my experience on developing opencl scale filter( https://patchwork.ffmpeg.org/patch/11910/ ).
I tried to merge scale and format-convert in one single OpenCL kernel.
But I failed to make the code clean after supporting interpolation method like bicubic, so I plan to separate them in two kernels these days.

And my experiments on scale_opencl show that merging scale with format-convert may not always get benefit.
For example, for 1080p scale-down, merging these two operations together is about 10% faster (for decode+scale), but for 4K input, merging two kernels make it slower.
My guess is different planes may compete the limited GPU cache. For scale-only, we can do it plane by plane, but for format-convert you have to read all the input planes and write all output planes at the same time.
This is just my guess, I have not root-caused what is the real reason. But I think keeping scale and format-convert in separate kernel function seems better.

I am also thinking about this issue other way, whether it is possible that we simple do the needed copy in crop/transpose and try to optimize off one filter if they are neighbors and pass the options to the other when configuring the filter pipeline?
Definitely I am really interested to see the work you described happen in FFmpeg.

Thanks!
Ruiling
diff mbox

Patch

diff --git a/Changelog b/Changelog
index ad7e82f..f224fc8 100644
--- a/Changelog
+++ b/Changelog
@@ -20,6 +20,7 @@  version <next>:
 - libaribb24 based ARIB STD-B24 caption support (profiles A and C)
 - Support decoding of HEVC 4:4:4 content in nvdec and cuviddec
 - removed libndi-newtek
+- crop_cuda GPU accelerated video crop filter
 
 
 version 4.1:
diff --git a/configure b/configure
index 331393f..3f3ac2f 100755
--- a/configure
+++ b/configure
@@ -2973,6 +2973,7 @@  qsvvpp_select="qsv"
 vaapi_encode_deps="vaapi"
 v4l2_m2m_deps="linux_videodev2_h sem_timedwait"
 
+crop_cuda_filter_deps="ffnvcodec cuda_nvcc"
 hwupload_cuda_filter_deps="ffnvcodec"
 scale_npp_filter_deps="ffnvcodec libnpp"
 scale_cuda_filter_deps="ffnvcodec cuda_nvcc"
diff --git a/doc/filters.texi b/doc/filters.texi
index 4ffb392..ee16a2d 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -7415,6 +7415,37 @@  If the specified expression is not valid, it is kept at its current
 value.
 @end table
 
+@section crop_cuda
+
+Crop the input video to given dimensions, implemented in CUDA.
+
+It accepts the following parameters:
+
+@table @option
+
+@item w
+The width of the output video. It defaults to @code{iw}.
+This expression is evaluated only once during the filter
+configuration.
+
+@item h
+The height of the output video. It defaults to @code{ih}.
+This expression is evaluated only once during the filter
+configuration.
+
+@item x
+The horizontal position, in the input video, of the left edge of the output
+video. It defaults to @code{(in_w-out_w)/2}.
+This expression is evaluated only once during the filter
+configuration.
+
+@item y
+The vertical position, in the input video, of the top edge of the output video.
+It defaults to @code{(in_h-out_h)/2}.
+This expression is evaluated only once during the filter
+configuration.
+@end table
+
 @section cropdetect
 
 Auto-detect the crop size.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index fef6ec5..84df037 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -187,6 +187,7 @@  OBJS-$(CONFIG_COPY_FILTER)                   += vf_copy.o
 OBJS-$(CONFIG_COREIMAGE_FILTER)              += vf_coreimage.o
 OBJS-$(CONFIG_COVER_RECT_FILTER)             += vf_cover_rect.o lavfutils.o
 OBJS-$(CONFIG_CROP_FILTER)                   += vf_crop.o
+OBJS-$(CONFIG_CROP_CUDA_FILTER)              += vf_crop_cuda.o vf_crop_cuda.ptx.o
 OBJS-$(CONFIG_CROPDETECT_FILTER)             += vf_cropdetect.o
 OBJS-$(CONFIG_CUE_FILTER)                    += f_cue.o
 OBJS-$(CONFIG_CURVES_FILTER)                 += vf_curves.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index c51ae0f..550e545 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -175,6 +175,7 @@  extern AVFilter ff_vf_copy;
 extern AVFilter ff_vf_coreimage;
 extern AVFilter ff_vf_cover_rect;
 extern AVFilter ff_vf_crop;
+extern AVFilter ff_vf_crop_cuda;
 extern AVFilter ff_vf_cropdetect;
 extern AVFilter ff_vf_cue;
 extern AVFilter ff_vf_curves;
diff --git a/libavfilter/version.h b/libavfilter/version.h
index c71282c..5aa95f4 100644
--- a/libavfilter/version.h
+++ b/libavfilter/version.h
@@ -31,7 +31,7 @@ 
 
 #define LIBAVFILTER_VERSION_MAJOR   7
 #define LIBAVFILTER_VERSION_MINOR  48
-#define LIBAVFILTER_VERSION_MICRO 100
+#define LIBAVFILTER_VERSION_MICRO 101
 
 #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
                                                LIBAVFILTER_VERSION_MINOR, \
diff --git a/libavfilter/vf_crop_cuda.c b/libavfilter/vf_crop_cuda.c
new file mode 100644
index 0000000..fc6a2a6
--- /dev/null
+++ b/libavfilter/vf_crop_cuda.c
@@ -0,0 +1,638 @@ 
+/*
+* Copyright (c) 2019, iQIYI CORPORATION. All rights reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*/
+
+#include <stdio.h>
+#include <string.h>
+
+#include "libavutil/avstring.h"
+#include "libavutil/common.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+#include "libavutil/internal.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/eval.h"
+
+#include "avfilter.h"
+#include "formats.h"
+#include "internal.h"
+#include "video.h"
+
+static const char *const var_names[] = {
+    "in_w", "iw",   ///< width  of the input video
+    "in_h", "ih",   ///< height of the input video
+    "out_w", "ow",  ///< width  of the cropped video
+    "out_h", "oh",  ///< height of the cropped video
+    "x",
+    "y",
+    NULL
+};
+
+enum var_name {
+    VAR_IN_W,  VAR_IW,
+    VAR_IN_H,  VAR_IH,
+    VAR_OUT_W, VAR_OW,
+    VAR_OUT_H, VAR_OH,
+    VAR_X,
+    VAR_Y,
+    VAR_VARS_NB
+};
+
+static const enum AVPixelFormat supported_formats[] = {
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NV12,
+    AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_P010,
+    AV_PIX_FMT_P016
+};
+
+#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
+#define ALIGN_UP(a, b) (((a) + (b) - 1) & ~((b) - 1))
+#define NUM_BUFFERS 2
+#define BLOCKX 32
+#define BLOCKY 16
+
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
+
+typedef struct CUDACropContext {
+    const AVClass *class;
+    AVCUDADeviceContext *hwctx;
+    enum AVPixelFormat in_fmt;
+    enum AVPixelFormat out_fmt;
+
+    struct {
+        int width;
+        int height;
+        int left;
+        int top;
+    } planes_in[3], planes_out[3];
+
+    AVBufferRef *frames_ctx;
+    AVFrame     *frame;
+
+    AVFrame *tmp_frame;
+    int passthrough;
+
+    /**
+     * Output sw format. AV_PIX_FMT_NONE for no conversion.
+     */
+    enum AVPixelFormat format;
+
+    int w,h,x,y;
+    char *w_expr, *h_expr, *x_expr, *y_expr;
+    double var_values[VAR_VARS_NB];
+
+    CUcontext   cu_ctx;
+    CUmodule    cu_module;
+    CUfunction  cu_func_uchar;
+    CUfunction  cu_func_uchar2;
+    CUfunction  cu_func_uchar4;
+    CUfunction  cu_func_ushort;
+    CUfunction  cu_func_ushort2;
+    CUfunction  cu_func_ushort4;
+    CUstream    cu_stream;
+
+    CUdeviceptr srcBuffer;
+    CUdeviceptr dstBuffer;
+    int         tex_alignment;
+} CUDACropContext;
+
+static av_cold int cudacrop_init(AVFilterContext *ctx)
+{
+    CUDACropContext *s = ctx->priv;
+
+    s->format = AV_PIX_FMT_NONE;
+    s->frame = av_frame_alloc();
+    if (!s->frame)
+        return AVERROR(ENOMEM);
+
+    s->tmp_frame = av_frame_alloc();
+    if (!s->tmp_frame)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static av_cold void cudacrop_uninit(AVFilterContext *ctx)
+{
+    CUDACropContext *s = ctx->priv;
+
+    av_frame_free(&s->frame);
+    av_buffer_unref(&s->frames_ctx);
+    av_frame_free(&s->tmp_frame);
+}
+
+static int cudacrop_query_formats(AVFilterContext *ctx)
+{
+    static const enum AVPixelFormat pixel_formats[] = {
+        AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE,
+    };
+    AVFilterFormats *pix_fmts = ff_make_format_list(pixel_formats);
+
+    return ff_set_common_formats(ctx, pix_fmts);
+}
+
+static av_cold int init_stage(CUDACropContext *s, AVBufferRef *device_ctx)
+{
+    AVBufferRef *out_ref = NULL;
+    AVHWFramesContext *out_ctx;
+    int in_sw, in_sh, out_sw, out_sh;
+    int ret, i;
+
+    av_pix_fmt_get_chroma_sub_sample(s->in_fmt,  &in_sw,  &in_sh);
+    av_pix_fmt_get_chroma_sub_sample(s->out_fmt, &out_sw, &out_sh);
+    if (!s->planes_out[0].width) {
+        s->planes_out[0].width  = s->planes_in[0].width;
+        s->planes_out[0].height = s->planes_in[0].height;
+        s->planes_out[0].left = s->planes_in[0].left;
+        s->planes_out[0].top  = s->planes_in[0].top;
+    }
+
+    for (i = 1; i < FF_ARRAY_ELEMS(s->planes_in); i++) {
+        s->planes_in[i].width   = s->planes_in[0].width   >> in_sw;
+        s->planes_in[i].height  = s->planes_in[0].height  >> in_sh;
+        s->planes_in[i].left    = s->planes_in[0].left    >> in_sw;
+        s->planes_in[i].top     = s->planes_in[0].top     >> in_sh;
+        s->planes_out[i].width  = s->planes_out[0].width  >> out_sw;
+        s->planes_out[i].height = s->planes_out[0].height >> out_sh;
+        s->planes_out[i].left   = 0;
+        s->planes_out[i].top    = 0;
+
+    }
+
+    out_ref = av_hwframe_ctx_alloc(device_ctx);
+    if (!out_ref)
+        return AVERROR(ENOMEM);
+    out_ctx = (AVHWFramesContext*)out_ref->data;
+
+    out_ctx->format    = AV_PIX_FMT_CUDA;
+    out_ctx->sw_format = s->out_fmt;
+    out_ctx->width     = FFALIGN(s->planes_out[0].width,  32);
+    out_ctx->height    = FFALIGN(s->planes_out[0].height, 32);
+
+    ret = av_hwframe_ctx_init(out_ref);
+    if (ret < 0)
+        goto fail;
+
+    av_frame_unref(s->frame);
+    ret = av_hwframe_get_buffer(out_ref, s->frame, 0);
+    if (ret < 0)
+        goto fail;
+
+    s->frame->width  = s->planes_out[0].width;
+    s->frame->height = s->planes_out[0].height;
+
+    av_buffer_unref(&s->frames_ctx);
+    s->frames_ctx = out_ref;
+
+    return 0;
+fail:
+    av_buffer_unref(&out_ref);
+    return ret;
+}
+
+static int format_is_supported(enum AVPixelFormat fmt)
+{
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
+        if (supported_formats[i] == fmt)
+            return 1;
+    return 0;
+}
+
+static av_cold int init_processing_chain(AVFilterContext *ctx, int in_width, int in_height,
+                                         int out_width, int out_height,
+                                         int left, int top)
+{
+    CUDACropContext *s = ctx->priv;
+
+    AVHWFramesContext *in_frames_ctx;
+
+    enum AVPixelFormat in_format;
+    enum AVPixelFormat out_format;
+    int ret;
+
+    /* check that we have a hw context */
+    if (!ctx->inputs[0]->hw_frames_ctx) {
+        av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n");
+        return AVERROR(EINVAL);
+    }
+    in_frames_ctx = (AVHWFramesContext*)ctx->inputs[0]->hw_frames_ctx->data;
+    in_format     = in_frames_ctx->sw_format;
+    out_format    = (s->format == AV_PIX_FMT_NONE) ? in_format : s->format;
+
+    if (!format_is_supported(in_format)) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n",
+               av_get_pix_fmt_name(in_format));
+        return AVERROR(ENOSYS);
+    }
+    if (!format_is_supported(out_format)) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n",
+               av_get_pix_fmt_name(out_format));
+        return AVERROR(ENOSYS);
+    }
+
+    if (in_width == out_width && in_height == out_height)
+        s->passthrough = 1;
+
+    s->in_fmt = in_format;
+    s->out_fmt = out_format;
+
+    s->planes_in[0].width   = in_width;
+    s->planes_in[0].height  = in_height;
+    s->planes_in[0].left = left;
+    s->planes_in[0].top = top;
+    s->planes_out[0].width  = out_width;
+    s->planes_out[0].height = out_height;
+    s->planes_out[0].left = 0;
+    s->planes_out[0].top = 0;
+
+    ret = init_stage(s, in_frames_ctx->device_ref);
+    if (ret < 0)
+        return ret;
+
+    ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->frames_ctx);
+    if (!ctx->outputs[0]->hw_frames_ctx)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static inline int normalize_double(int *n, double d)
+{
+    int ret = 0;
+
+    if (isnan(d))
+        ret = AVERROR(EINVAL);
+    else if (d > INT_MAX || d < INT_MIN) {
+        *n = d > INT_MAX ? INT_MAX : INT_MIN;
+        ret = AVERROR(EINVAL);
+    } else
+        *n = lrint(d);
+
+    return ret;
+}
+
+static av_cold int cudacrop_config_input(AVFilterLink *inlink)
+{
+    AVFilterContext *ctx = inlink->dst;
+    CUDACropContext *s = ctx->priv;
+    double res;
+    int ret;
+
+    s->var_values[VAR_IN_W] = s->var_values[VAR_IW] = inlink->w;
+    s->var_values[VAR_IN_H] = s->var_values[VAR_IH] = inlink->h;
+    s->var_values[VAR_OUT_W] = s->var_values[VAR_OW] = NAN;
+    s->var_values[VAR_OUT_H] = s->var_values[VAR_OH] = NAN;
+    s->var_values[VAR_X] = NAN;
+    s->var_values[VAR_Y] = NAN;
+    if ((ret = av_expr_parse_and_eval(&res, s->w_expr,
+                                      var_names, s->var_values,
+                                      NULL, NULL, NULL, NULL, NULL, 0, ctx)) < 0)
+        goto fail;
+    s->var_values[VAR_OUT_W] = s->var_values[VAR_OW] = res;
+    if ((ret = av_expr_parse_and_eval(&res, s->h_expr,
+                                      var_names, s->var_values,
+                                      NULL, NULL, NULL, NULL, NULL, 0, ctx)) < 0)
+        goto fail;
+    s->var_values[VAR_OUT_H] = s->var_values[VAR_OH] = res;
+    if ((ret = av_expr_parse_and_eval(&res, s->x_expr,
+                                      var_names, s->var_values,
+                                      NULL, NULL, NULL, NULL, NULL, 0, ctx)) < 0)
+        goto fail;
+    s->var_values[VAR_X] = res;
+    if ((ret = av_expr_parse_and_eval(&res, s->y_expr,
+                                      var_names, s->var_values,
+                                      NULL, NULL, NULL, NULL, NULL, 0, ctx)) < 0)
+        goto fail;
+    s->var_values[VAR_Y] = res;
+    if (normalize_double(&s->w, s->var_values[VAR_OW]) < 0 ||
+        normalize_double(&s->h, s->var_values[VAR_OH]) < 0 ||
+        normalize_double(&s->x, s->var_values[VAR_X]) < 0 ||
+        normalize_double(&s->y, s->var_values[VAR_Y]) < 0) {
+        av_log(ctx, AV_LOG_ERROR,
+               "Too big value or invalid expression for out_w/ow or out_h/oh or x or y");
+        return AVERROR(EINVAL);
+    }
+
+fail:
+    return ret;
+}
+
+static av_cold int cudacrop_config_output(AVFilterLink *outlink)
+{
+    AVFilterContext *ctx = outlink->src;
+    AVFilterLink *inlink = outlink->src->inputs[0];
+    CUDACropContext *s  = ctx->priv;
+    AVHWFramesContext     *frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
+    AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx;
+    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
+    CudaFunctions *cu = device_hwctx->internal->cuda_dl;
+    int ret;
+
+    extern char vf_crop_cuda_ptx[];
+
+    s->hwctx = device_hwctx;
+    s->cu_stream = s->hwctx->stream;
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
+    if (ret < 0)
+        goto fail;
+
+    ret = CHECK_CU(cu->cuModuleLoadData(&s->cu_module, vf_crop_cuda_ptx));
+    if (ret < 0)
+        goto fail;
+
+    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, "Crop_uchar"));
+    if (ret < 0)
+        goto fail;
+    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, "Crop_uchar2"));
+    if (ret < 0)
+        goto fail;
+    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar4, s->cu_module, "Crop_uchar4"));
+    if (ret < 0)
+        goto fail;
+    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort, s->cu_module, "Crop_ushort"));
+    if (ret < 0)
+        goto fail;
+    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, "Crop_ushort2"));
+    if (ret < 0)
+        goto fail;
+    CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort4, s->cu_module, "Crop_ushort4"));
+    if (ret < 0)
+        goto fail;
+
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+
+    outlink->w = s->w;
+    outlink->h = s->h;
+
+    ret = init_processing_chain(ctx, inlink->w, inlink->h, s->w, s->h, s->x, s->y);
+    if (ret < 0)
+        return ret;
+
+    if (inlink->sample_aspect_ratio.num) {
+        outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h*inlink->w,
+                                                             outlink->w*inlink->h},
+                                                inlink->sample_aspect_ratio);
+    } else {
+        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
+    }
+
+    return 0;
+
+fail:
+    return ret;
+}
+
+static int call_crop_kernel(AVFilterContext *ctx, CUfunction func, int channels,
+                              uint8_t *src_dptr, int src_width, int src_height, int src_pitch,
+                              uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch,
+                              int left, int top, int pixel_size)
+{
+    CUDACropContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+    CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr;
+    CUtexObject tex = 0;
+    void *args_uchar[] = { &tex, &dst_devptr, &dst_width, &dst_height, &dst_pitch, &left, &top };
+    int ret;
+
+    CUDA_TEXTURE_DESC tex_desc = {
+        .filterMode = CU_TR_FILTER_MODE_LINEAR,
+        .flags = CU_TRSF_READ_AS_INTEGER,
+    };
+
+    CUDA_RESOURCE_DESC res_desc = {
+        .resType = CU_RESOURCE_TYPE_PITCH2D,
+        .res.pitch2D.format = pixel_size == 1 ?
+                              CU_AD_FORMAT_UNSIGNED_INT8 :
+                              CU_AD_FORMAT_UNSIGNED_INT16,
+        .res.pitch2D.numChannels = channels,
+        .res.pitch2D.width = src_width,
+        .res.pitch2D.height = src_height,
+        .res.pitch2D.pitchInBytes = src_pitch,
+        .res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
+    };
+
+    ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
+    if (ret < 0)
+        goto exit;
+    
+    ret = CHECK_CU(cu->cuLaunchKernel(func, DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL));
+
+exit:
+    if (tex)
+        CHECK_CU(cu->cuTexObjectDestroy(tex));
+    return ret;
+}
+
+static int cropcuda_crop_internal(AVFilterContext *ctx,
+                            AVFrame *out, AVFrame *in)
+{
+    AVHWFramesContext *in_frames_ctx = (AVHWFramesContext*)in->hw_frames_ctx->data;
+    CUDACropContext *s = ctx->priv;
+
+    switch (in_frames_ctx->sw_format) {
+    case AV_PIX_FMT_YUV420P:
+        call_crop_kernel(ctx, s->cu_func_uchar, 1,
+                           in->data[0], in->width, in->height, in->linesize[0],
+                           out->data[0], out->width, out->height, out->linesize[0],
+                           s->planes_in[0].left, s->planes_in[0].top, 1);
+        call_crop_kernel(ctx, s->cu_func_uchar, 1,
+                           in->data[0]+in->linesize[0]*in->height, in->width/2, in->height/2, in->linesize[0]/2,
+                           out->data[0]+out->linesize[0]*out->height, out->width/2, out->height/2, out->linesize[0]/2,
+                           s->planes_in[1].left, s->planes_in[1].top, 1);
+        call_crop_kernel(ctx, s->cu_func_uchar, 1,
+                           in->data[0]+ ALIGN_UP((in->linesize[0]*in->height*5)/4, s->tex_alignment), in->width/2, in->height/2, in->linesize[0]/2,
+                           out->data[0]+(out->linesize[0]*out->height*5)/4, out->width/2, out->height/2, out->linesize[0]/2,
+                           s->planes_in[2].left, s->planes_in[2].top, 1);
+        break;
+    case AV_PIX_FMT_YUV444P:
+        call_crop_kernel(ctx, s->cu_func_uchar, 1,
+                           in->data[0], in->width, in->height, in->linesize[0],
+                           out->data[0], out->width, out->height, out->linesize[0],
+                           s->planes_in[0].left, s->planes_in[0].top, 1);
+        call_crop_kernel(ctx, s->cu_func_uchar, 1,
+                           in->data[0]+in->linesize[0]*in->height, in->width, in->height, in->linesize[0],
+                           out->data[0]+out->linesize[0]*out->height, out->width, out->height, out->linesize[0],
+                           s->planes_in[1].left, s->planes_in[1].top, 1);
+        call_crop_kernel(ctx, s->cu_func_uchar, 1,
+                           in->data[0]+in->linesize[0]*in->height*2, in->width, in->height, in->linesize[0],
+                           out->data[0]+out->linesize[0]*out->height*2, out->width, out->height, out->linesize[0],
+                           s->planes_in[2].left, s->planes_in[2].top, 1);
+        break;
+    case AV_PIX_FMT_NV12:
+        call_crop_kernel(ctx, s->cu_func_uchar, 1,
+                           in->data[0], in->width, in->height, in->linesize[0],
+                           out->data[0], out->width, out->height, out->linesize[0],
+                           s->planes_in[0].left, s->planes_in[0].top, 1);
+        call_crop_kernel(ctx, s->cu_func_uchar2, 2,
+                           in->data[1], in->width/2, in->height/2, in->linesize[1],
+                           out->data[0] + out->linesize[0] * ((out->height + 31) & ~0x1f), out->width/2, out->height/2, out->linesize[1]/2,
+                           s->planes_in[1].left, s->planes_in[1].top, 1);
+        break;
+    case AV_PIX_FMT_P010LE:
+        call_crop_kernel(ctx, s->cu_func_ushort, 1,
+                           in->data[0], in->width, in->height, in->linesize[0]/2,
+                           out->data[0], out->width, out->height, out->linesize[0]/2,
+                           s->planes_in[0].left, s->planes_in[0].top, 2);
+        call_crop_kernel(ctx, s->cu_func_ushort2, 2,
+                           in->data[1], in->width / 2, in->height / 2, in->linesize[1]/2,
+                           out->data[0] + out->linesize[0] * ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2, out->linesize[1] / 4,
+                           s->planes_in[1].left, s->planes_in[1].top, 2);
+        break;
+    case AV_PIX_FMT_P016LE:
+        call_crop_kernel(ctx, s->cu_func_ushort, 1,
+                           in->data[0], in->width, in->height, in->linesize[0] / 2,
+                           out->data[0], out->width, out->height, out->linesize[0] / 2,
+                           s->planes_in[0].left, s->planes_in[0].top, 2);
+        call_crop_kernel(ctx, s->cu_func_ushort2, 2,
+                           in->data[1], in->width / 2, in->height / 2, in->linesize[1] / 2,
+                           out->data[0] + out->linesize[0] * ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2, out->linesize[1] / 4,
+                           s->planes_in[1].left, s->planes_in[1].top, 2);
+        break;
+    default:
+        return AVERROR_BUG;
+    }
+
+    return 0;
+}
+
+static int cudacrop_crop(AVFilterContext *ctx, AVFrame *out, AVFrame *in)
+{
+    CUDACropContext *s = ctx->priv;
+    AVFrame *src = in;
+    int ret;
+
+    ret = cropcuda_crop_internal(ctx, s->frame, src);
+    if (ret < 0)
+        return ret;
+
+    src = s->frame;
+    ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0);
+    if (ret < 0)
+        return ret;
+
+    av_frame_move_ref(out, s->frame);
+    av_frame_move_ref(s->frame, s->tmp_frame);
+
+    ret = av_frame_copy_props(out, in);
+    if (ret < 0)
+        return ret;
+
+    return 0;
+}
+
+static int cudacrop_filter_frame(AVFilterLink *link, AVFrame *in)
+{
+    AVFilterContext              *ctx = link->dst;
+    CUDACropContext              *s = ctx->priv;
+    AVFilterLink             *outlink = ctx->outputs[0];
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+    AVFrame *out = NULL;
+    CUcontext dummy;
+    int ret = 0;
+
+    out = av_frame_alloc();
+    if (!out) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
+    if (ret < 0)
+        goto fail;
+
+    ret = cudacrop_crop(ctx, out, in);
+
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    if (ret < 0)
+        goto fail;
+
+    av_reduce(&out->sample_aspect_ratio.num, &out->sample_aspect_ratio.den,
+              (int64_t)in->sample_aspect_ratio.num * outlink->h * link->w,
+              (int64_t)in->sample_aspect_ratio.den * outlink->w * link->h,
+              INT_MAX);
+
+    av_frame_free(&in);
+    return ff_filter_frame(outlink, out);
+fail:
+    av_frame_free(&in);
+    av_frame_free(&out);
+    return ret;
+}
+
+#define OFFSET(x) offsetof(CUDACropContext, x)
+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM)
+static const AVOption options[] = {
+    { "w",      "set the width crop area expression",  OFFSET(w_expr),     AV_OPT_TYPE_STRING, { .str = "iw"   }, .flags = FLAGS },
+    { "h",      "set the height crop area expression", OFFSET(h_expr),     AV_OPT_TYPE_STRING, { .str = "ih"   }, .flags = FLAGS },
+    { "x",      "set the x crop area expression",      OFFSET(x_expr),     AV_OPT_TYPE_STRING, { .str = "(in_w-out_w)/2"}, .flags = FLAGS },
+    { "y",      "set the y crop area expression",      OFFSET(y_expr),     AV_OPT_TYPE_STRING, { .str = "(in_h-out_h)/2"}, .flags = FLAGS },
+    { NULL },
+};
+
+static const AVClass cudacrop_class = {
+    .class_name = "cudacrop",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVFilterPad cudacrop_inputs[] = {
+    {
+        .name        = "default",
+        .type        = AVMEDIA_TYPE_VIDEO,
+        .filter_frame = cudacrop_filter_frame,
+        .config_props = cudacrop_config_input,
+    },
+    { NULL }
+};
+
+static const AVFilterPad cudacrop_outputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = cudacrop_config_output,
+    },
+    { NULL }
+};
+
+AVFilter ff_vf_crop_cuda = {
+    .name      = "crop_cuda",
+    .description = NULL_IF_CONFIG_SMALL("GPU accelerated video crop"),
+
+    .init          = cudacrop_init,
+    .uninit        = cudacrop_uninit,
+    .query_formats = cudacrop_query_formats,
+
+    .priv_size = sizeof(CUDACropContext),
+    .priv_class = &cudacrop_class,
+
+    .inputs    = cudacrop_inputs,
+    .outputs   = cudacrop_outputs,
+
+    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};
diff --git a/libavfilter/vf_crop_cuda.cu b/libavfilter/vf_crop_cuda.cu
new file mode 100644
index 0000000..4b94b73
--- /dev/null
+++ b/libavfilter/vf_crop_cuda.cu
@@ -0,0 +1,109 @@ 
+/*
+ * Copyright (c) 2019, iQIYI CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+extern "C" {
+
+__global__ void Crop_uchar(cudaTextureObject_t uchar_tex,
+                           unsigned char *dst,
+                           int dst_width, int dst_height, int dst_pitch,
+                           int left, int top)
+{
+    int xo = blockIdx.x * blockDim.x + threadIdx.x;
+    int yo = blockIdx.y * blockDim.y + threadIdx.y;
+    int xi = xo + left;
+    int yi = yo + top;
+
+    if (yo < dst_height && xo < dst_width)
+        dst[yo*dst_pitch+xo] = (unsigned char) tex2D<unsigned char>(uchar_tex, xi, yi);
+}
+
+__global__ void Crop_uchar2(cudaTextureObject_t uchar2_tex,
+                            uchar2 *dst,
+                            int dst_width, int dst_height, int dst_pitch,
+                            int left, int top)
+{
+    int xo = blockIdx.x * blockDim.x + threadIdx.x;
+    int yo = blockIdx.y * blockDim.y + threadIdx.y;
+    int xi = xo + left;
+    int yi = yo + top;
+
+    if (yo < dst_height && xo < dst_width)
+        dst[yo*dst_pitch+xo] = (uchar2) tex2D<uchar2>(uchar2_tex, xi, yi);
+}
+
+__global__ void Crop_uchar4(cudaTextureObject_t uchar4_tex,
+                            uchar4 *dst,
+                            int dst_width, int dst_height, int dst_pitch,
+                            int left, int top)
+{
+    int xo = blockIdx.x * blockDim.x + threadIdx.x;
+    int yo = blockIdx.y * blockDim.y + threadIdx.y;
+    int xi = xo + left;
+    int yi = yo + top;
+
+    if (yo < dst_height && xo < dst_width)
+        dst[yo*dst_pitch+xo] = (uchar4) tex2D<uchar4>(uchar4_tex, xi, yi);
+}
+
+__global__ void Crop_ushort(cudaTextureObject_t ushort_tex,
+                            unsigned short *dst,
+                            int dst_width, int dst_height, int dst_pitch,
+                            int left, int top)
+{
+    int xo = blockIdx.x * blockDim.x + threadIdx.x;
+    int yo = blockIdx.y * blockDim.y + threadIdx.y;
+    int xi = xo + left;
+    int yi = yo + top;
+
+    if (yo < dst_height && xo < dst_width)
+        dst[yo*dst_pitch+xo] = (unsigned short) tex2D<unsigned short>(ushort_tex, xi, yi);
+}
+
+__global__ void Crop_ushort2(cudaTextureObject_t ushort2_tex,
+                             ushort2 *dst,
+                             int dst_width, int dst_height, int dst_pitch,
+                             int left, int top)
+{
+    int xo = blockIdx.x * blockDim.x + threadIdx.x;
+    int yo = blockIdx.y * blockDim.y + threadIdx.y;
+    int xi = xo + left;
+    int yi = yo + top;
+
+    if (yo < dst_height && xo < dst_width)
+        dst[yo*dst_pitch+xo] = (ushort2) tex2D<ushort2>(ushort2_tex, xi, yi);
+}
+
+__global__ void Crop_ushort4(cudaTextureObject_t ushort4_tex,
+                             ushort4 *dst,
+                             int dst_width, int dst_height, int dst_pitch,
+                             int left, int top)
+{
+    int xo = blockIdx.x * blockDim.x + threadIdx.x;
+    int yo = blockIdx.y * blockDim.y + threadIdx.y;
+    int xi = xo + left;
+    int yi = yo + top;
+
+    if (yo < dst_height && xo < dst_width)
+        dst[yo*dst_pitch+xo] = (ushort4) tex2D<ushort4>(ushort4_tex, xi, yi);
+}
+
+}