
[FFmpeg-devel] avfilter: add sharpen_npp video filter

Message ID BN9PR12MB52748C67771A57F5C9B3FECBD2D99@BN9PR12MB5274.namprd12.prod.outlook.com
State New
Series [FFmpeg-devel] avfilter: add sharpen_npp video filter

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_ppc success Make finished
andriy/make_fate_ppc success Make fate finished

Commit Message

Roman Arzumanyan Sept. 13, 2021, 8:57 a.m. UTC
This patch adds a simple sharpening filter accelerated by NPP. CLI sample:

./ffmpeg \
  -hwaccel cuda -hwaccel_output_format cuda \
  -i ./input.mp4 \
  -vf sharpen_npp \
  -c:v hevc_nvenc \
  -y ./output_sharp.mp4
Subject: [PATCH] sharpen_npp video filter added

---
 configure                    |   5 +-
 libavfilter/Makefile         |   1 +
 libavfilter/allfilters.c     |   1 +
 libavfilter/vf_sharpen_npp.c | 530 +++++++++++++++++++++++++++++++++++
 4 files changed, 535 insertions(+), 2 deletions(-)
 create mode 100644 libavfilter/vf_sharpen_npp.c

Comments

Timo Rothenpieler Sept. 28, 2021, 7:58 p.m. UTC | #1
> From 0df6297bd3664beb05c813c5fc62852e61616fa9 Mon Sep 17 00:00:00 2001
> From: Roman Arzumanyan <rarzumanyan@nvidia.com>
> Date: Mon, 6 Sep 2021 14:26:27 +0300
> Subject: [PATCH] sharpen_npp video filter added

Same as with the other patch, this should match the mail subject
(i.e. "avfilter: add sharpen_npp video filter").

> ---
>  configure                    |   5 +-
>  libavfilter/Makefile         |   1 +
>  libavfilter/allfilters.c     |   1 +
>  libavfilter/vf_sharpen_npp.c | 530 +++++++++++++++++++++++++++++++++++

Missing entry in the filter docs (doc/filters.texi).

Missing avfilter minor version bump (LIBAVFILTER_VERSION_MINOR in 
libavfilter/version.h).

>  4 files changed, 535 insertions(+), 2 deletions(-)
>  create mode 100644 libavfilter/vf_sharpen_npp.c
> 
> diff --git a/configure b/configure
> index af410a9d11..e092cc8c67 100755
> --- a/configure
> +++ b/configure
> @@ -3094,6 +3094,7 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
>  transpose_npp_filter_deps="ffnvcodec libnpp"
>  overlay_cuda_filter_deps="ffnvcodec"
>  overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
> +sharpen_npp_filter_deps="ffnvcodec libnpp"
>  
>  amf_deps_any="libdl LoadLibrary"
>  nvenc_deps="ffnvcodec"
> @@ -6443,8 +6444,8 @@ enabled libmodplug        && require_pkg_config libmodplug libmodplug libmodplug
>  enabled libmp3lame        && require "libmp3lame >= 3.98.3" lame/lame.h lame_set_VBR_quality -lmp3lame $libm_extralibs
>  enabled libmysofa         && { check_pkg_config libmysofa libmysofa mysofa.h mysofa_neighborhood_init_withstepdefine ||
>                                 require libmysofa mysofa.h mysofa_neighborhood_init_withstepdefine -lmysofa $zlib_extralibs; }
> -enabled libnpp            && { check_lib libnpp npp.h nppGetLibVersion -lnppig -lnppicc -lnppc -lnppidei ||
> -                               check_lib libnpp npp.h nppGetLibVersion -lnppi -lnppc -lnppidei ||
> +enabled libnpp            && { check_lib libnpp npp.h nppGetLibVersion -lnppig -lnppicc -lnppc -lnppidei -lnppif ||
> +                               check_lib libnpp npp.h nppGetLibVersion -lnppi -lnppif -lnppc -lnppidei ||
>                                 die "ERROR: libnpp not found"; }

I was wondering whether it's worth splitting the new dependency out 
into a separate check, but it's probably not, and fine to just pull it 
along even if sharpen_npp were to be disabled.
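For illustration only, a split-out check might look roughly like this 
(untested sketch; "libnpp_sharpen" is a made-up config name, while the 
header and symbol are the ones the new filter actually uses):

enabled libnpp_sharpen && check_lib libnpp_sharpen nppi_filtering_functions.h nppiFilterSharpenBorder_8u_C1R -lnppif -lnppc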

>  enabled libopencore_amrnb && require libopencore_amrnb opencore-amrnb/interf_dec.h Decoder_Interface_init -lopencore-amrnb
>  enabled libopencore_amrwb && require libopencore_amrwb opencore-amrwb/dec_if.h D_IF_init -lopencore-amrwb
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index af957a5ac0..330ddfe5d5 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -423,6 +423,7 @@ OBJS-$(CONFIG_SETRANGE_FILTER)               += vf_setparams.o
>  OBJS-$(CONFIG_SETSAR_FILTER)                 += vf_aspect.o
>  OBJS-$(CONFIG_SETTB_FILTER)                  += settb.o
>  OBJS-$(CONFIG_SHARPNESS_VAAPI_FILTER)        += vf_misc_vaapi.o vaapi_vpp.o
> +OBJS-$(CONFIG_SHARPEN_NPP_FILTER)            += vf_sharpen_npp.o

Should be above SHARPNESS_VAAPI if strictly following alphabetic ordering.

>  OBJS-$(CONFIG_SHEAR_FILTER)                  += vf_shear.o
>  OBJS-$(CONFIG_SHOWINFO_FILTER)               += vf_showinfo.o
>  OBJS-$(CONFIG_SHOWPALETTE_FILTER)            += vf_showpalette.o
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index 0c6b2347c8..e50e5f3b6a 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -404,6 +404,7 @@ extern const AVFilter ff_vf_setrange;
>  extern const AVFilter ff_vf_setsar;
>  extern const AVFilter ff_vf_settb;
>  extern const AVFilter ff_vf_sharpness_vaapi;
> +extern const AVFilter ff_vf_sharpen_npp;

Same here about the order.

>  extern const AVFilter ff_vf_shear;
>  extern const AVFilter ff_vf_showinfo;
>  extern const AVFilter ff_vf_showpalette;
> diff --git a/libavfilter/vf_sharpen_npp.c b/libavfilter/vf_sharpen_npp.c
> new file mode 100644
> index 0000000000..85549c36d0
> --- /dev/null
> +++ b/libavfilter/vf_sharpen_npp.c
> @@ -0,0 +1,530 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * NPP sharpen video filter
> + */
> +
> +#include <nppi.h>
> +#include <nppi_filtering_functions.h>
> +#include <stdio.h>
> +#include <string.h>
> +
> +#include "libavutil/avstring.h"
> +#include "libavutil/common.h"
> +#include "libavutil/hwcontext.h"
> +#include "libavutil/hwcontext_cuda_internal.h"
> +#include "libavutil/cuda_check.h"
> +#include "libavutil/internal.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/pixdesc.h"
> +
> +#include "avfilter.h"
> +#include "formats.h"
> +#include "internal.h"
> +#include "scale_eval.h"

Unused header

> +#include "video.h"
> +
> +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, device_hwctx->internal->cuda_dl, x)
> +
> +static const enum AVPixelFormat supported_formats[] = {
> +    AV_PIX_FMT_YUV420P,
> +    AV_PIX_FMT_NV12,
> +    AV_PIX_FMT_YUV444P,
> +};
> +
> +static const enum AVPixelFormat deinterleaved_formats[][2] = {
> +    { AV_PIX_FMT_NV12, AV_PIX_FMT_YUV420P },
> +};
> +
> +enum SharpenStage {
> +    STAGE_DEINTERLEAVE,
> +    STAGE_SHARPEN,
> +    STAGE_INTERLEAVE,
> +    STAGE_NB,
> +};
> +
> +typedef struct NPPSharpenStageContext {
> +    int stage_needed;
> +    enum AVPixelFormat in_fmt;
> +    enum AVPixelFormat out_fmt;
> +
> +    struct {
> +        int width;
> +        int height;
> +    } planes_in[3], planes_out[3];
> +
> +    AVBufferRef *frames_ctx;
> +    AVFrame     *frame;
> +} NPPSharpenStageContext;
> +
> +typedef struct NPPSharpenContext {
> +    const AVClass *class;
> +
> +    NPPSharpenStageContext stages[STAGE_NB];
> +    AVFrame *tmp_frame;
> +
> +    /**
> +     * Output sw format. AV_PIX_FMT_NONE for no conversion.
> +     */
> +    enum AVPixelFormat format;
> +
> +    /**
> +     * Width, height and pixel format strings;
> +     */
> +    char *w_expr;
> +    char *h_expr;
> +    char *format_str;    
> +
> +} NPPSharpenContext;
> +
> +static int nppsharpen_init(AVFilterContext *ctx)
> +{
> +    NPPSharpenContext *s = ctx->priv;
> +    int i;
> +
> +    if (!strcmp(s->format_str, "same")) {
> +        s->format = AV_PIX_FMT_NONE;
> +    } else {
> +        s->format = av_get_pix_fmt(s->format_str);
> +        if (s->format == AV_PIX_FMT_NONE) {
> +            av_log(ctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str);
> +            return AVERROR(EINVAL);
> +        }
> +    }
> +
> +    for (i = 0; i < FF_ARRAY_ELEMS(s->stages); i++) {
> +        s->stages[i].frame = av_frame_alloc();
> +        if (!s->stages[i].frame)
> +            return AVERROR(ENOMEM);
> +    }
> +    s->tmp_frame = av_frame_alloc();
> +    if (!s->tmp_frame)
> +        return AVERROR(ENOMEM);
> +
> +    return 0;
> +}
> +
> +static void nppsharpen_uninit(AVFilterContext *ctx)
> +{
> +    NPPSharpenContext              *s = ctx->priv;
> +    int i;
> +
> +    for (i = 0; i < FF_ARRAY_ELEMS(s->stages); i++) {
> +        av_frame_free(&s->stages[i].frame);
> +        av_buffer_unref(&s->stages[i].frames_ctx);
> +    }
> +    av_frame_free(&s->tmp_frame);
> +}
> +
> +static int nppsharpen_query_formats(AVFilterContext *ctx)
> +{
> +    static const enum AVPixelFormat pixel_formats[] = {
> +        AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE,
> +    };
> +    AVFilterFormats *pix_fmts = ff_make_format_list(pixel_formats);
> +
> +    return ff_set_common_formats(ctx, pix_fmts);
> +}
> +
> +static int init_stage(NPPSharpenStageContext *stage, AVBufferRef *device_ctx)
> +{
> +    AVBufferRef *out_ref = NULL;
> +    AVHWFramesContext *out_ctx;
> +    int in_sw, in_sh, out_sw, out_sh;
> +    int ret, i;
> +
> +    av_pix_fmt_get_chroma_sub_sample(stage->in_fmt,  &in_sw,  &in_sh);
> +    av_pix_fmt_get_chroma_sub_sample(stage->out_fmt, &out_sw, &out_sh);
> +    if (!stage->planes_out[0].width) {
> +        stage->planes_out[0].width  = stage->planes_in[0].width;
> +        stage->planes_out[0].height = stage->planes_in[0].height;
> +    }
> +
> +    for (i = 1; i < FF_ARRAY_ELEMS(stage->planes_in); i++) {
> +        stage->planes_in[i].width   = stage->planes_in[0].width   >> in_sw;
> +        stage->planes_in[i].height  = stage->planes_in[0].height  >> in_sh;
> +        stage->planes_out[i].width  = stage->planes_out[0].width  >> out_sw;
> +        stage->planes_out[i].height = stage->planes_out[0].height >> out_sh;
> +    }
> +
> +    out_ref = av_hwframe_ctx_alloc(device_ctx);
> +    if (!out_ref)
> +        return AVERROR(ENOMEM);
> +    out_ctx = (AVHWFramesContext*)out_ref->data;
> +
> +    out_ctx->format    = AV_PIX_FMT_CUDA;
> +    out_ctx->sw_format = stage->out_fmt;
> +    out_ctx->width     = FFALIGN(stage->planes_out[0].width,  32);
> +    out_ctx->height    = FFALIGN(stage->planes_out[0].height, 32);
> +
> +    ret = av_hwframe_ctx_init(out_ref);
> +    if (ret < 0)
> +        goto fail;
> +
> +    av_frame_unref(stage->frame);
> +    ret = av_hwframe_get_buffer(out_ref, stage->frame, 0);
> +    if (ret < 0)
> +        goto fail;
> +
> +    stage->frame->width  = stage->planes_out[0].width;
> +    stage->frame->height = stage->planes_out[0].height;
> +
> +    av_buffer_unref(&stage->frames_ctx);
> +    stage->frames_ctx = out_ref;
> +
> +    return 0;
> +fail:
> +    av_buffer_unref(&out_ref);
> +    return ret;
> +}
> +
> +static int format_is_supported(enum AVPixelFormat fmt)
> +{
> +    int i;
> +
> +    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
> +        if (supported_formats[i] == fmt)
> +            return 1;
> +    return 0;
> +}
> +
> +static enum AVPixelFormat get_deinterleaved_format(enum AVPixelFormat fmt)
> +{
> +    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt);
> +    int i, planes;
> +
> +    planes = av_pix_fmt_count_planes(fmt);
> +    if (planes == desc->nb_components)
> +        return fmt;
> +    for (i = 0; i < FF_ARRAY_ELEMS(deinterleaved_formats); i++)
> +        if (deinterleaved_formats[i][0] == fmt)
> +            return deinterleaved_formats[i][1];
> +    return AV_PIX_FMT_NONE;
> +}
> +
> +static int init_processing_chain(AVFilterContext *ctx, int width, int height)
> +{
> +    NPPSharpenContext *s = ctx->priv;
> +    AVHWFramesContext *in_frames_ctx;
> +
> +    enum AVPixelFormat in_format;
> +    enum AVPixelFormat out_format;
> +    enum AVPixelFormat in_deinterleaved_format;
> +    enum AVPixelFormat out_deinterleaved_format;
> +
> +    int i, ret, last_stage = -1;
> +
> +    /* check that we have a hw context */
> +    if (!ctx->inputs[0]->hw_frames_ctx) {
> +        av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n");
> +        return AVERROR(EINVAL);
> +    }
> +    in_frames_ctx = (AVHWFramesContext*)ctx->inputs[0]->hw_frames_ctx->data;
> +    in_format     = in_frames_ctx->sw_format;
> +    out_format    = (s->format == AV_PIX_FMT_NONE) ? in_format : s->format;
> +
> +    if (!format_is_supported(in_format)) {
> +        av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n",
> +               av_get_pix_fmt_name(in_format));
> +        return AVERROR(ENOSYS);
> +    }
> +    if (!format_is_supported(out_format)) {
> +        av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n",
> +               av_get_pix_fmt_name(out_format));
> +        return AVERROR(ENOSYS);
> +    }
> +
> +    in_deinterleaved_format  = get_deinterleaved_format(in_format);
> +    out_deinterleaved_format = get_deinterleaved_format(out_format);
> +    if (in_deinterleaved_format  == AV_PIX_FMT_NONE ||
> +        out_deinterleaved_format == AV_PIX_FMT_NONE)
> +        return AVERROR_BUG;
> +
> +    /* figure out which stages need to be done */
> +    s->stages[STAGE_SHARPEN].stage_needed = 1;
> +    if (in_format != in_deinterleaved_format)
> +        s->stages[STAGE_DEINTERLEAVE].stage_needed = 1;
> +    if (out_format != out_deinterleaved_format)
> +        s->stages[STAGE_INTERLEAVE].stage_needed = 1;
> +
> +    s->stages[STAGE_DEINTERLEAVE].in_fmt              = in_format;
> +    s->stages[STAGE_DEINTERLEAVE].out_fmt             = in_deinterleaved_format;
> +    s->stages[STAGE_DEINTERLEAVE].planes_in[0].width  = width;
> +    s->stages[STAGE_DEINTERLEAVE].planes_in[0].height = height;
> +
> +    s->stages[STAGE_SHARPEN].in_fmt               = in_deinterleaved_format;
> +    s->stages[STAGE_SHARPEN].out_fmt              = out_deinterleaved_format;
> +    s->stages[STAGE_SHARPEN].planes_in[0].width   = width;
> +    s->stages[STAGE_SHARPEN].planes_in[0].height  = height;
> +    s->stages[STAGE_SHARPEN].planes_out[0].width  = width;
> +    s->stages[STAGE_SHARPEN].planes_out[0].height = height;
> +
> +    s->stages[STAGE_INTERLEAVE].in_fmt              = out_deinterleaved_format;
> +    s->stages[STAGE_INTERLEAVE].out_fmt             = out_format;
> +    s->stages[STAGE_INTERLEAVE].planes_in[0].width  = width;
> +    s->stages[STAGE_INTERLEAVE].planes_in[0].height = height;
> +
> +    /* init the hardware contexts */
> +    for (i = 0; i < FF_ARRAY_ELEMS(s->stages); i++) {
> +        if (!s->stages[i].stage_needed)
> +            continue;
> +
> +        ret = init_stage(&s->stages[i], in_frames_ctx->device_ref);
> +        if (ret < 0)
> +            return ret;
> +
> +        last_stage = i;
> +    }
> +
> +    if (last_stage >= 0)
> +        ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->stages[last_stage].frames_ctx);
> +    else
> +        ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(ctx->inputs[0]->hw_frames_ctx);
> +
> +    if (!ctx->outputs[0]->hw_frames_ctx)
> +        return AVERROR(ENOMEM);
> +
> +    return 0;
> +}
> +
> +static int nppsharpen_config_props(AVFilterLink *outlink)
> +{
> +    AVFilterContext *ctx = outlink->src;
> +    AVFilterLink *inlink = outlink->src->inputs[0];
> +    int ret;
> +
> +    outlink->w = inlink->w;
> +    outlink->h = inlink->h;
> +
> +    ret = init_processing_chain(ctx, inlink->w, inlink->h);
> +    if (ret < 0)
> +        return ret;
> +
> +    if (inlink->sample_aspect_ratio.num)
> +        outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h*inlink->w,
> +                                                             outlink->w*inlink->h},
> +                                                inlink->sample_aspect_ratio);
> +    else
> +        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
> +
> +    return ret;
> +}
> +
> +static int nppsharpen_deinterleave(AVFilterContext *ctx, NPPSharpenStageContext *stage,
> +                                 AVFrame *out, AVFrame *in)
> +{
> +    AVHWFramesContext *in_frames_ctx = (AVHWFramesContext*)in->hw_frames_ctx->data;
> +    NppStatus err;
> +
> +    switch (in_frames_ctx->sw_format) {
> +    case AV_PIX_FMT_NV12:
> +        err = nppiYCbCr420_8u_P2P3R(in->data[0], in->linesize[0],
> +                                    in->data[1], in->linesize[1],
> +                                    out->data, out->linesize,
> +                                    (NppiSize){ in->width, in->height });
> +        break;
> +    default:
> +        return AVERROR_BUG;
> +    }
> +    if (err != NPP_SUCCESS) {
> +        av_log(ctx, AV_LOG_ERROR, "NPP deinterleave error: %d\n", err);
> +        return AVERROR_UNKNOWN;
> +    }
> +
> +    return 0;
> +}
> +
> +static int nppsharpen_apply_filter(AVFilterContext *ctx, NPPSharpenStageContext *stage,
> +                           AVFrame *out, AVFrame *in)
> +{
> +    NppStatus err;
> +    int i;
> +
> +    for (i = 0; i < FF_ARRAY_ELEMS(stage->planes_in) && i < FF_ARRAY_ELEMS(in->data) && in->data[i]; i++) {
> +        int ow = stage->planes_out[i].width;
> +        int oh = stage->planes_out[i].height;
> +
> +        err = nppiFilterSharpenBorder_8u_C1R(in->data[i], in->linesize[i], 
> +                                             (NppiSize){ow, oh}, (NppiPoint){0, 0},
> +                                             out->data[i], out->linesize[i], 
> +                                             (NppiSize){ow, oh},
> +                                             NPP_BORDER_REPLICATE);
> +        if (err != NPP_SUCCESS) {
> +            av_log(ctx, AV_LOG_ERROR, "NPP sharpen error: %d\n", err);
> +            return AVERROR_UNKNOWN;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static int nppsharpen_interleave(AVFilterContext *ctx, NPPSharpenStageContext *stage,
> +                               AVFrame *out, AVFrame *in)
> +{
> +    AVHWFramesContext *out_frames_ctx = (AVHWFramesContext*)out->hw_frames_ctx->data;
> +    NppStatus err;
> +
> +    switch (out_frames_ctx->sw_format) {
> +    case AV_PIX_FMT_NV12:
> +        err = nppiYCbCr420_8u_P3P2R((const uint8_t**)in->data,
> +                                    in->linesize,
> +                                    out->data[0], out->linesize[0],
> +                                    out->data[1], out->linesize[1],
> +                                    (NppiSize){ in->width, in->height });
> +        break;
> +    default:
> +        return AVERROR_BUG;
> +    }
> +    if (err != NPP_SUCCESS) {
> +        av_log(ctx, AV_LOG_ERROR, "NPP deinterleave error: %d\n", err);
> +        return AVERROR_UNKNOWN;
> +    }
> +
> +    return 0;
> +}
> +
> +static int (*const nppsharpen_process[])(AVFilterContext *ctx, NPPSharpenStageContext *stage,
> +                                       AVFrame *out, AVFrame *in) = {
> +    [STAGE_DEINTERLEAVE] = nppsharpen_deinterleave,
> +    [STAGE_SHARPEN]      = nppsharpen_apply_filter,
> +    [STAGE_INTERLEAVE]   = nppsharpen_interleave,
> +};
> +
> +static int nppsharpen_sharpen(AVFilterContext *ctx, AVFrame *out, AVFrame *in)
> +{
> +    NPPSharpenContext *s = ctx->priv;
> +    AVFrame *src = in;
> +    int i, ret, last_stage = -1;
> +
> +    for (i = 0; i < FF_ARRAY_ELEMS(s->stages); i++) {
> +        if (!s->stages[i].stage_needed)
> +            continue;
> +
> +        ret = nppsharpen_process[i](ctx, &s->stages[i], s->stages[i].frame, src);
> +        if (ret < 0)
> +            return ret;
> +
> +        src        = s->stages[i].frame;
> +        last_stage = i;
> +    }
> +
> +    if (last_stage < 0)
> +        return AVERROR_BUG;
> +    ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0);
> +    if (ret < 0)
> +        return ret;
> +
> +    av_frame_move_ref(out, src);
> +    av_frame_move_ref(src, s->tmp_frame);
> +
> +    ret = av_frame_copy_props(out, in);
> +    if (ret < 0)
> +        return ret;
> +
> +    return 0;
> +}
> +
> +static int nppsharpen_filter_frame(AVFilterLink *link, AVFrame *in)
> +{
> +    AVFilterContext              *ctx = link->dst;
> +    AVFilterLink             *outlink = ctx->outputs[0];
> +    AVHWFramesContext     *frames_ctx = (AVHWFramesContext*)outlink->hw_frames_ctx->data;
> +    AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx;
> +
> +    AVFrame *out = NULL;
> +    CUcontext dummy;
> +    int ret = 0;
> +
> +    out = av_frame_alloc();
> +    if (!out) {
> +        ret = AVERROR(ENOMEM);
> +        goto fail;
> +    }
> +
> +    ret = CHECK_CU(device_hwctx->internal->cuda_dl->cuCtxPushCurrent(device_hwctx->cuda_ctx));
> +    if (ret < 0)
> +        goto fail;
> +
> +    ret = nppsharpen_sharpen(ctx, out, in);
> +
> +    CHECK_CU(device_hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
> +    if (ret < 0)
> +        goto fail;
> +
> +    av_reduce(&out->sample_aspect_ratio.num, &out->sample_aspect_ratio.den,
> +              (int64_t)in->sample_aspect_ratio.num * outlink->h * link->w,
> +              (int64_t)in->sample_aspect_ratio.den * outlink->w * link->h,
> +              INT_MAX);
> +
> +    av_frame_free(&in);
> +    return ff_filter_frame(outlink, out);
> +fail:
> +    av_frame_free(&in);
> +    av_frame_free(&out);
> +    return ret;
> +}
> +
> +#define OFFSET(x) offsetof(NPPSharpenContext, x)
> +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM)
> +static const AVOption options[] = {
> +    { "format",     "Output pixel format.", OFFSET(format_str), AV_OPT_TYPE_STRING, { .str = "same" }, .flags = FLAGS },
> +    { NULL },
> +};
> +
> +static const AVClass nppsharpen_class = {
> +    .class_name = "nppsharpen",
> +    .item_name  = av_default_item_name,
> +    .option     = options,
> +    .version    = LIBAVUTIL_VERSION_INT,
> +};
> +
> +static const AVFilterPad nppsharpen_inputs[] = {
> +    {
> +        .name        = "default",
> +        .type        = AVMEDIA_TYPE_VIDEO,
> +        .filter_frame = nppsharpen_filter_frame,
> +    }
> +};
> +
> +static const AVFilterPad nppsharpen_outputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +        .config_props = nppsharpen_config_props,
> +    }
> +};
> +
> +const AVFilter ff_vf_sharpen_npp = {
> +    .name      = "sharpen_npp",
> +    .description = NULL_IF_CONFIG_SMALL("NVIDIA Performance Primitives video "
> +                                        "sharpening filter."),
> +
> +    .init          = nppsharpen_init,
> +    .uninit        = nppsharpen_uninit,
> +    .query_formats = nppsharpen_query_formats,
> +
> +    .priv_size = sizeof(NPPSharpenContext),
> +    .priv_class = &nppsharpen_class,
> +
> +    FILTER_INPUTS(nppsharpen_inputs),
> +    FILTER_OUTPUTS(nppsharpen_outputs),
> +
> +    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
> +};
> -- 
> 2.25.1
> 

A general point about this, but also other npp filters:
Is it really worth having the de/interleave stages in all of them?
The filter would be a lot simpler without them, and you can just put 
scale_cuda or scale_npp in front to convert to a planar format; nvenc 
can then take that as input, eliminating the re-interleaving step 
entirely.

Even more generally, this filter shares a lot of code with scale_npp, 
to the point where it should be possible to combine them and only swap 
out the middle stage based on which filter is in use.
Couldn't this whole filter just be an option to scale_npp, which turns 
on sharpening, instead of a whole other filter?
The scaling stages are already skipped if they'd be a no-op, and 
sharpening could just be another optional stage.
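
To make that concrete, a chain along these lines (a sketch based on the 
sample command from the commit message; file names and encoder choice 
are placeholders) already feeds sharpen_npp planar frames that nvenc 
can take directly:

./ffmpeg \
  -hwaccel cuda -hwaccel_output_format cuda \
  -i ./input.mp4 \
  -vf scale_npp=format=yuv420p,sharpen_npp \
  -c:v hevc_nvenc \
  -y ./output_sharp.mp4

If sharpening were folded into scale_npp instead, usage might reduce to 
a single hypothetical option such as 
"-vf scale_npp=format=yuv420p:sharpen=1" (option name made up, it does 
not exist today).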
Roman Arzumanyan Sept. 30, 2021, 6:51 p.m. UTC | #2
Thanks for the review, Timo.

Please find revised patch attached.
(De)interleaving is now removed, and a doc entry was added as well.
Timo Rothenpieler Oct. 7, 2021, 4:14 p.m. UTC | #3
On 30.09.2021 20:51, Roman Arzumanyan wrote:
> Thanks for the review, Timo.
> 
> Please find revised patch attached.
> (De)interleaving is now removed, and a doc entry was added as well.

applied with a few minor amendments, thanks!


While testing this via "-vf scale_npp=format=yuv420p,sharpen_npp" I 
discovered a weird issue where the last line of pixels was pink/purple, 
apparently missing the luma plane.

This turned out to be a bug in scale_npp, which returned frames with a 
height of 1088 instead of 1080 for a 1080p source.
It was fairly simple to fix, and I also pushed that fix: 
http://git.videolan.org/?p=ffmpeg.git;a=commit;h=37745b49049d4dbb5aa4ea84bcc289ed511971f9
I'm just very surprised that nobody noticed this issue in such a long time.



Timo

Patch

diff --git a/configure b/configure
index af410a9d11..e092cc8c67 100755
--- a/configure
+++ b/configure
@@ -3094,6 +3094,7 @@  thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
 overlay_cuda_filter_deps="ffnvcodec"
 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+sharpen_npp_filter_deps="ffnvcodec libnpp"
 
 amf_deps_any="libdl LoadLibrary"
 nvenc_deps="ffnvcodec"
@@ -6443,8 +6444,8 @@  enabled libmodplug        && require_pkg_config libmodplug libmodplug libmodplug
 enabled libmp3lame        && require "libmp3lame >= 3.98.3" lame/lame.h lame_set_VBR_quality -lmp3lame $libm_extralibs
 enabled libmysofa         && { check_pkg_config libmysofa libmysofa mysofa.h mysofa_neighborhood_init_withstepdefine ||
                                require libmysofa mysofa.h mysofa_neighborhood_init_withstepdefine -lmysofa $zlib_extralibs; }
-enabled libnpp            && { check_lib libnpp npp.h nppGetLibVersion -lnppig -lnppicc -lnppc -lnppidei ||
-                               check_lib libnpp npp.h nppGetLibVersion -lnppi -lnppc -lnppidei ||
+enabled libnpp            && { check_lib libnpp npp.h nppGetLibVersion -lnppig -lnppicc -lnppc -lnppidei -lnppif ||
+                               check_lib libnpp npp.h nppGetLibVersion -lnppi -lnppif -lnppc -lnppidei ||
                                die "ERROR: libnpp not found"; }
 enabled libopencore_amrnb && require libopencore_amrnb opencore-amrnb/interf_dec.h Decoder_Interface_init -lopencore-amrnb
 enabled libopencore_amrwb && require libopencore_amrwb opencore-amrwb/dec_if.h D_IF_init -lopencore-amrwb
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index af957a5ac0..330ddfe5d5 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -423,6 +423,7 @@  OBJS-$(CONFIG_SETRANGE_FILTER)               += vf_setparams.o
 OBJS-$(CONFIG_SETSAR_FILTER)                 += vf_aspect.o
 OBJS-$(CONFIG_SETTB_FILTER)                  += settb.o
 OBJS-$(CONFIG_SHARPNESS_VAAPI_FILTER)        += vf_misc_vaapi.o vaapi_vpp.o
+OBJS-$(CONFIG_SHARPEN_NPP_FILTER)            += vf_sharpen_npp.o
 OBJS-$(CONFIG_SHEAR_FILTER)                  += vf_shear.o
 OBJS-$(CONFIG_SHOWINFO_FILTER)               += vf_showinfo.o
 OBJS-$(CONFIG_SHOWPALETTE_FILTER)            += vf_showpalette.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 0c6b2347c8..e50e5f3b6a 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -404,6 +404,7 @@  extern const AVFilter ff_vf_setrange;
 extern const AVFilter ff_vf_setsar;
 extern const AVFilter ff_vf_settb;
 extern const AVFilter ff_vf_sharpness_vaapi;
+extern const AVFilter ff_vf_sharpen_npp;
 extern const AVFilter ff_vf_shear;
 extern const AVFilter ff_vf_showinfo;
 extern const AVFilter ff_vf_showpalette;
diff --git a/libavfilter/vf_sharpen_npp.c b/libavfilter/vf_sharpen_npp.c
new file mode 100644
index 0000000000..85549c36d0
--- /dev/null
+++ b/libavfilter/vf_sharpen_npp.c
@@ -0,0 +1,530 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * NPP sharpen video filter
+ */
+
+#include <nppi.h>
+#include <nppi_filtering_functions.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "libavutil/avstring.h"
+#include "libavutil/common.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+#include "libavutil/internal.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "avfilter.h"
+#include "formats.h"
+#include "internal.h"
+#include "scale_eval.h"
+#include "video.h"
+
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, device_hwctx->internal->cuda_dl, x)
+
+static const enum AVPixelFormat supported_formats[] = {
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NV12,
+    AV_PIX_FMT_YUV444P,
+};
+
+static const enum AVPixelFormat deinterleaved_formats[][2] = {
+    { AV_PIX_FMT_NV12, AV_PIX_FMT_YUV420P },
+};
+
+enum SharpenStage {
+    STAGE_DEINTERLEAVE,
+    STAGE_SHARPEN,
+    STAGE_INTERLEAVE,
+    STAGE_NB,
+};
+
+typedef struct NPPSharpenStageContext {
+    int stage_needed;
+    enum AVPixelFormat in_fmt;
+    enum AVPixelFormat out_fmt;
+
+    struct {
+        int width;
+        int height;
+    } planes_in[3], planes_out[3];
+
+    AVBufferRef *frames_ctx;
+    AVFrame     *frame;
+} NPPSharpenStageContext;
+
+typedef struct NPPSharpenContext {
+    const AVClass *class;
+
+    NPPSharpenStageContext stages[STAGE_NB];
+    AVFrame *tmp_frame;
+
+    /**
+     * Output sw format. AV_PIX_FMT_NONE for no conversion.
+     */
+    enum AVPixelFormat format;
+
+    /**
+     * Width, height and pixel format strings;
+     */
+    char *w_expr;
+    char *h_expr;
+    char *format_str;    
+
+} NPPSharpenContext;
+
+static int nppsharpen_init(AVFilterContext *ctx)
+{
+    NPPSharpenContext *s = ctx->priv;
+    int i;
+
+    if (!strcmp(s->format_str, "same")) {
+        s->format = AV_PIX_FMT_NONE;
+    } else {
+        s->format = av_get_pix_fmt(s->format_str);
+        if (s->format == AV_PIX_FMT_NONE) {
+            av_log(ctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str);
+            return AVERROR(EINVAL);
+        }
+    }
+
+    for (i = 0; i < FF_ARRAY_ELEMS(s->stages); i++) {
+        s->stages[i].frame = av_frame_alloc();
+        if (!s->stages[i].frame)
+            return AVERROR(ENOMEM);
+    }
+    s->tmp_frame = av_frame_alloc();
+    if (!s->tmp_frame)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static void nppsharpen_uninit(AVFilterContext *ctx)
+{
+    NPPSharpenContext              *s = ctx->priv;
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(s->stages); i++) {
+        av_frame_free(&s->stages[i].frame);
+        av_buffer_unref(&s->stages[i].frames_ctx);
+    }
+    av_frame_free(&s->tmp_frame);
+}
+
+static int nppsharpen_query_formats(AVFilterContext *ctx)
+{
+    static const enum AVPixelFormat pixel_formats[] = {
+        AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE,
+    };
+    AVFilterFormats *pix_fmts = ff_make_format_list(pixel_formats);
+
+    return ff_set_common_formats(ctx, pix_fmts);
+}
+
+static int init_stage(NPPSharpenStageContext *stage, AVBufferRef *device_ctx)
+{
+    AVBufferRef *out_ref = NULL;
+    AVHWFramesContext *out_ctx;
+    int in_sw, in_sh, out_sw, out_sh;
+    int ret, i;
+
+    av_pix_fmt_get_chroma_sub_sample(stage->in_fmt,  &in_sw,  &in_sh);
+    av_pix_fmt_get_chroma_sub_sample(stage->out_fmt, &out_sw, &out_sh);
+    if (!stage->planes_out[0].width) {
+        stage->planes_out[0].width  = stage->planes_in[0].width;
+        stage->planes_out[0].height = stage->planes_in[0].height;
+    }
+
+    for (i = 1; i < FF_ARRAY_ELEMS(stage->planes_in); i++) {
+        stage->planes_in[i].width   = stage->planes_in[0].width   >> in_sw;
+        stage->planes_in[i].height  = stage->planes_in[0].height  >> in_sh;
+        stage->planes_out[i].width  = stage->planes_out[0].width  >> out_sw;
+        stage->planes_out[i].height = stage->planes_out[0].height >> out_sh;
+    }
+
+    out_ref = av_hwframe_ctx_alloc(device_ctx);
+    if (!out_ref)
+        return AVERROR(ENOMEM);
+    out_ctx = (AVHWFramesContext*)out_ref->data;
+
+    out_ctx->format    = AV_PIX_FMT_CUDA;
+    out_ctx->sw_format = stage->out_fmt;
+    out_ctx->width     = FFALIGN(stage->planes_out[0].width,  32);
+    out_ctx->height    = FFALIGN(stage->planes_out[0].height, 32);
+
+    ret = av_hwframe_ctx_init(out_ref);
+    if (ret < 0)
+        goto fail;
+
+    av_frame_unref(stage->frame);
+    ret = av_hwframe_get_buffer(out_ref, stage->frame, 0);
+    if (ret < 0)
+        goto fail;
+
+    stage->frame->width  = stage->planes_out[0].width;
+    stage->frame->height = stage->planes_out[0].height;
+
+    av_buffer_unref(&stage->frames_ctx);
+    stage->frames_ctx = out_ref;
+
+    return 0;
+fail:
+    av_buffer_unref(&out_ref);
+    return ret;
+}
+
+static int format_is_supported(enum AVPixelFormat fmt)
+{
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
+        if (supported_formats[i] == fmt)
+            return 1;
+    return 0;
+}
+
+static enum AVPixelFormat get_deinterleaved_format(enum AVPixelFormat fmt)
+{
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt);
+    int i, planes;
+
+    planes = av_pix_fmt_count_planes(fmt);
+    if (planes == desc->nb_components)
+        return fmt;
+    for (i = 0; i < FF_ARRAY_ELEMS(deinterleaved_formats); i++)
+        if (deinterleaved_formats[i][0] == fmt)
+            return deinterleaved_formats[i][1];
+    return AV_PIX_FMT_NONE;
+}
+
+static int init_processing_chain(AVFilterContext *ctx, int width, int height)
+{
+    NPPSharpenContext *s = ctx->priv;
+    AVHWFramesContext *in_frames_ctx;
+
+    enum AVPixelFormat in_format;
+    enum AVPixelFormat out_format;
+    enum AVPixelFormat in_deinterleaved_format;
+    enum AVPixelFormat out_deinterleaved_format;
+
+    int i, ret, last_stage = -1;
+
+    /* check that we have a hw context */
+    if (!ctx->inputs[0]->hw_frames_ctx) {
+        av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n");
+        return AVERROR(EINVAL);
+    }
+    in_frames_ctx = (AVHWFramesContext*)ctx->inputs[0]->hw_frames_ctx->data;
+    in_format     = in_frames_ctx->sw_format;
+    out_format    = (s->format == AV_PIX_FMT_NONE) ? in_format : s->format;
+
+    if (!format_is_supported(in_format)) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n",
+               av_get_pix_fmt_name(in_format));
+        return AVERROR(ENOSYS);
+    }
+    if (!format_is_supported(out_format)) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n",
+               av_get_pix_fmt_name(out_format));
+        return AVERROR(ENOSYS);
+    }
+
+    in_deinterleaved_format  = get_deinterleaved_format(in_format);
+    out_deinterleaved_format = get_deinterleaved_format(out_format);
+    if (in_deinterleaved_format  == AV_PIX_FMT_NONE ||
+        out_deinterleaved_format == AV_PIX_FMT_NONE)
+        return AVERROR_BUG;
+
+    /* figure out which stages need to be done */
+    s->stages[STAGE_SHARPEN].stage_needed = 1;
+    if (in_format != in_deinterleaved_format)
+        s->stages[STAGE_DEINTERLEAVE].stage_needed = 1;
+    if (out_format != out_deinterleaved_format)
+        s->stages[STAGE_INTERLEAVE].stage_needed = 1;
+
+    s->stages[STAGE_DEINTERLEAVE].in_fmt              = in_format;
+    s->stages[STAGE_DEINTERLEAVE].out_fmt             = in_deinterleaved_format;
+    s->stages[STAGE_DEINTERLEAVE].planes_in[0].width  = width;
+    s->stages[STAGE_DEINTERLEAVE].planes_in[0].height = height;
+
+    s->stages[STAGE_SHARPEN].in_fmt               = in_deinterleaved_format;
+    s->stages[STAGE_SHARPEN].out_fmt              = out_deinterleaved_format;
+    s->stages[STAGE_SHARPEN].planes_in[0].width   = width;
+    s->stages[STAGE_SHARPEN].planes_in[0].height  = height;
+    s->stages[STAGE_SHARPEN].planes_out[0].width  = width;
+    s->stages[STAGE_SHARPEN].planes_out[0].height = height;
+
+    s->stages[STAGE_INTERLEAVE].in_fmt              = out_deinterleaved_format;
+    s->stages[STAGE_INTERLEAVE].out_fmt             = out_format;
+    s->stages[STAGE_INTERLEAVE].planes_in[0].width  = width;
+    s->stages[STAGE_INTERLEAVE].planes_in[0].height = height;
+
+    /* init the hardware contexts */
+    for (i = 0; i < FF_ARRAY_ELEMS(s->stages); i++) {
+        if (!s->stages[i].stage_needed)
+            continue;
+
+        ret = init_stage(&s->stages[i], in_frames_ctx->device_ref);
+        if (ret < 0)
+            return ret;
+
+        last_stage = i;
+    }
+
+    if (last_stage >= 0)
+        ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->stages[last_stage].frames_ctx);
+    else
+        ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(ctx->inputs[0]->hw_frames_ctx);
+
+    if (!ctx->outputs[0]->hw_frames_ctx)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static int nppsharpen_config_props(AVFilterLink *outlink)
+{
+    AVFilterContext *ctx = outlink->src;
+    AVFilterLink *inlink = outlink->src->inputs[0];
+    int ret;
+
+    outlink->w = inlink->w;
+    outlink->h = inlink->h;
+
+    ret = init_processing_chain(ctx, inlink->w, inlink->h);
+    if (ret < 0)
+        return ret;
+
+    if (inlink->sample_aspect_ratio.num)
+        outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h*inlink->w,
+                                                             outlink->w*inlink->h},
+                                                inlink->sample_aspect_ratio);
+    else
+        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
+
+    return ret;
+}
+
+static int nppsharpen_deinterleave(AVFilterContext *ctx, NPPSharpenStageContext *stage,
+                                 AVFrame *out, AVFrame *in)
+{
+    AVHWFramesContext *in_frames_ctx = (AVHWFramesContext*)in->hw_frames_ctx->data;
+    NppStatus err;
+
+    switch (in_frames_ctx->sw_format) {
+    case AV_PIX_FMT_NV12:
+        err = nppiYCbCr420_8u_P2P3R(in->data[0], in->linesize[0],
+                                    in->data[1], in->linesize[1],
+                                    out->data, out->linesize,
+                                    (NppiSize){ in->width, in->height });
+        break;
+    default:
+        return AVERROR_BUG;
+    }
+    if (err != NPP_SUCCESS) {
+        av_log(ctx, AV_LOG_ERROR, "NPP deinterleave error: %d\n", err);
+        return AVERROR_UNKNOWN;
+    }
+
+    return 0;
+}
+
+static int nppsharpen_apply_filter(AVFilterContext *ctx, NPPSharpenStageContext *stage,
+                           AVFrame *out, AVFrame *in)
+{
+    NppStatus err;
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(stage->planes_in) && i < FF_ARRAY_ELEMS(in->data) && in->data[i]; i++) {
+        int ow = stage->planes_out[i].width;
+        int oh = stage->planes_out[i].height;
+
+        err = nppiFilterSharpenBorder_8u_C1R(in->data[i], in->linesize[i], 
+                                             (NppiSize){ow, oh}, (NppiPoint){0, 0},
+                                             out->data[i], out->linesize[i], 
+                                             (NppiSize){ow, oh},
+                                             NPP_BORDER_REPLICATE);
+        if (err != NPP_SUCCESS) {
+            av_log(ctx, AV_LOG_ERROR, "NPP sharpen error: %d\n", err);
+            return AVERROR_UNKNOWN;
+        }
+    }
+
+    return 0;
+}
+
+static int nppsharpen_interleave(AVFilterContext *ctx, NPPSharpenStageContext *stage,
+                               AVFrame *out, AVFrame *in)
+{
+    AVHWFramesContext *out_frames_ctx = (AVHWFramesContext*)out->hw_frames_ctx->data;
+    NppStatus err;
+
+    switch (out_frames_ctx->sw_format) {
+    case AV_PIX_FMT_NV12:
+        err = nppiYCbCr420_8u_P3P2R((const uint8_t**)in->data,
+                                    in->linesize,
+                                    out->data[0], out->linesize[0],
+                                    out->data[1], out->linesize[1],
+                                    (NppiSize){ in->width, in->height });
+        break;
+    default:
+        return AVERROR_BUG;
+    }
+    if (err != NPP_SUCCESS) {
+        av_log(ctx, AV_LOG_ERROR, "NPP deinterleave error: %d\n", err);
+        return AVERROR_UNKNOWN;
+    }
+
+    return 0;
+}
+
+static int (*const nppsharpen_process[])(AVFilterContext *ctx, NPPSharpenStageContext *stage,
+                                       AVFrame *out, AVFrame *in) = {
+    [STAGE_DEINTERLEAVE] = nppsharpen_deinterleave,
+    [STAGE_SHARPEN]      = nppsharpen_apply_filter,
+    [STAGE_INTERLEAVE]   = nppsharpen_interleave,
+};
+
+static int nppsharpen_sharpen(AVFilterContext *ctx, AVFrame *out, AVFrame *in)
+{
+    NPPSharpenContext *s = ctx->priv;
+    AVFrame *src = in;
+    int i, ret, last_stage = -1;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(s->stages); i++) {
+        if (!s->stages[i].stage_needed)
+            continue;
+
+        ret = nppsharpen_process[i](ctx, &s->stages[i], s->stages[i].frame, src);
+        if (ret < 0)
+            return ret;
+
+        src        = s->stages[i].frame;
+        last_stage = i;
+    }
+
+    if (last_stage < 0)
+        return AVERROR_BUG;
+    ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0);
+    if (ret < 0)
+        return ret;
+
+    av_frame_move_ref(out, src);
+    av_frame_move_ref(src, s->tmp_frame);
+
+    ret = av_frame_copy_props(out, in);
+    if (ret < 0)
+        return ret;
+
+    return 0;
+}
+
+static int nppsharpen_filter_frame(AVFilterLink *link, AVFrame *in)
+{
+    AVFilterContext              *ctx = link->dst;
+    AVFilterLink             *outlink = ctx->outputs[0];
+    AVHWFramesContext     *frames_ctx = (AVHWFramesContext*)outlink->hw_frames_ctx->data;
+    AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx;
+
+    AVFrame *out = NULL;
+    CUcontext dummy;
+    int ret = 0;
+
+    out = av_frame_alloc();
+    if (!out) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    ret = CHECK_CU(device_hwctx->internal->cuda_dl->cuCtxPushCurrent(device_hwctx->cuda_ctx));
+    if (ret < 0)
+        goto fail;
+
+    ret = nppsharpen_sharpen(ctx, out, in);
+
+    CHECK_CU(device_hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
+    if (ret < 0)
+        goto fail;
+
+    av_reduce(&out->sample_aspect_ratio.num, &out->sample_aspect_ratio.den,
+              (int64_t)in->sample_aspect_ratio.num * outlink->h * link->w,
+              (int64_t)in->sample_aspect_ratio.den * outlink->w * link->h,
+              INT_MAX);
+
+    av_frame_free(&in);
+    return ff_filter_frame(outlink, out);
+fail:
+    av_frame_free(&in);
+    av_frame_free(&out);
+    return ret;
+}
+
+#define OFFSET(x) offsetof(NPPSharpenContext, x)
+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM)
+static const AVOption options[] = {
+    { "format",     "Output pixel format.", OFFSET(format_str), AV_OPT_TYPE_STRING, { .str = "same" }, .flags = FLAGS },
+    { NULL },
+};
+
+static const AVClass nppsharpen_class = {
+    .class_name = "nppsharpen",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVFilterPad nppsharpen_inputs[] = {
+    {
+        .name        = "default",
+        .type        = AVMEDIA_TYPE_VIDEO,
+        .filter_frame = nppsharpen_filter_frame,
+    }
+};
+
+static const AVFilterPad nppsharpen_outputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = nppsharpen_config_props,
+    }
+};
+
+const AVFilter ff_vf_sharpen_npp = {
+    .name      = "sharpen_npp",
+    .description = NULL_IF_CONFIG_SMALL("NVIDIA Performance Primitives video "
+                                        "sharpening filter."),
+
+    .init          = nppsharpen_init,
+    .uninit        = nppsharpen_uninit,
+    .query_formats = nppsharpen_query_formats,
+
+    .priv_size = sizeof(NPPSharpenContext),
+    .priv_class = &nppsharpen_class,
+
+    FILTER_INPUTS(nppsharpen_inputs),
+    FILTER_OUTPUTS(nppsharpen_outputs),
+
+    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};