[FFmpeg-devel,2/4] scale_cuda frame crop support

Message ID	20241018230552.80169-2-koushd@gmail.com
State	New
Headers	show Delivered-To: ffmpegpatchwork2@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; From: Koushik Dutta <koushd@gmail.com> To: ffmpeg-devel@ffmpeg.org Date: Fri, 18 Oct 2024 16:05:50 -0700 Message-Id: <20241018230552.80169-2-koushd@gmail.com> In-Reply-To: <20241018230552.80169-1-koushd@gmail.com> References: <20241018230552.80169-1-koushd@gmail.com> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 2/4] scale_cuda frame crop support Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Cc: Koushik Dutta <koushd@gmail.com> Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>
Series	[FFmpeg-devel,1/4] scale_vt frame crop support \| expand [FFmpeg-devel,1/4] scale_vt frame crop support [FFmpeg-devel,2/4] scale_cuda frame crop support [FFmpeg-devel,3/4] scale_qsv frame crop support [FFmpeg-devel,4/4] scale_vt implementation for negative width/height sizing

Message ID

20241018230552.80169-2-koushd@gmail.com

State

New

Headers

Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org
 designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100;
From: Koushik Dutta <koushd@gmail.com>
To: ffmpeg-devel@ffmpeg.org
Date: Fri, 18 Oct 2024 16:05:50 -0700
Message-Id: <20241018230552.80169-2-koushd@gmail.com>
In-Reply-To: <20241018230552.80169-1-koushd@gmail.com>
References: <20241018230552.80169-1-koushd@gmail.com>
MIME-Version: 1.0
Subject: [FFmpeg-devel] [PATCH 2/4] scale_cuda frame crop support
Precedence: list
Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Cc: Koushik Dutta <koushd@gmail.com>
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Errors-To: ffmpeg-devel-bounces@ffmpeg.org
Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

Series

[FFmpeg-devel,1/4] scale_vt frame crop support | expand

Commit Message

Koushik Dutta Oct. 18, 2024, 11:05 p.m. UTC

The crop filter has no effect when its output is fed to scale_cuda:

-vf crop=100:100,scale_cuda=300x300

Hardware frames (AV_PIX_FMT_FLAG_HWACCEL) are expected to use the crop_* properties,
as seen in the implementation of vf_crop.c.

The current workaround is to hwdownload the full frame
and perform the crop on CPU.
---
 libavfilter/vf_scale_cuda.c  | 17 +++++++++++------
 libavfilter/vf_scale_cuda.cu | 22 ++++++++++++++--------
 2 files changed, 25 insertions(+), 14 deletions(-)

Comments

Timo Rothenpieler Oct. 19, 2024, 5:48 p.m. UTC | #1

The commit message doesn't follow the standard format.
It should be prefixed with avfilter/scale_cuda: or lavfi/scale_cuda:

On 19.10.2024 01:05, Koushik Dutta wrote:
> The crop filter has no effect on scale_cuda:
> 
> -vf crop=100:100,scale_cuda=300x300
> 
> Hardware frames (AV_PIX_FMT_FLAG_HWACCEL) are expected to use the crop_* properties,
> as seen in the implementation vf_crop.c.
> 
> The current workaround is to hwdownload the full frame
> and perform the crop on CPU.
> ---
>   libavfilter/vf_scale_cuda.c  | 17 +++++++++++------
>   libavfilter/vf_scale_cuda.cu | 22 ++++++++++++++--------
>   2 files changed, 25 insertions(+), 14 deletions(-)
> 
> diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c
> index 54a340949d..8615da308a 100644
> --- a/libavfilter/vf_scale_cuda.c
> +++ b/libavfilter/vf_scale_cuda.c
> @@ -407,7 +407,7 @@ fail:
>   }
>   
>   static int call_resize_kernel(AVFilterContext *ctx, CUfunction func,
> -                              CUtexObject src_tex[4], int src_width, int src_height,
> +                              CUtexObject src_tex[4], int src_left, int src_top, int src_width, int src_height,
>                                 AVFrame *out_frame, int dst_width, int dst_height, int dst_pitch)
>   {
>       CUDAScaleContext *s = ctx->priv;
> @@ -422,7 +422,7 @@ static int call_resize_kernel(AVFilterContext *ctx, CUfunction func,
>           &src_tex[0], &src_tex[1], &src_tex[2], &src_tex[3],
>           &dst_devptr[0], &dst_devptr[1], &dst_devptr[2], &dst_devptr[3],
>           &dst_width, &dst_height, &dst_pitch,
> -        &src_width, &src_height, &s->param
> +        &src_left, &src_top, &src_width, &src_height, &s->param
>       };
>   
>       return CHECK_CU(cu->cuLaunchKernel(func,
> @@ -440,6 +440,9 @@ static int scalecuda_resize(AVFilterContext *ctx,
>   
>       CUtexObject tex[4] = { 0, 0, 0, 0 };
>   
> +    int crop_width = (in->width - in->crop_right) - in->crop_left;
> +    int crop_height = (in->height - in->crop_bottom) - in->crop_top;
> +
>       ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
>       if (ret < 0)
>           return ret;
> @@ -477,7 +480,7 @@ static int scalecuda_resize(AVFilterContext *ctx,
>   
>       // scale primary plane(s). Usually Y (and A), or single plane of RGB frames.
>       ret = call_resize_kernel(ctx, s->cu_func,
> -                             tex, in->width, in->height,
> +                             tex, in->crop_left, in->crop_top, crop_width, crop_height,
>                                out, out->width, out->height, out->linesize[0]);
>       if (ret < 0)
>           goto exit;
> @@ -485,8 +488,10 @@ static int scalecuda_resize(AVFilterContext *ctx,
>       if (s->out_planes > 1) {
>           // scale UV plane. Scale function sets both U and V plane, or singular interleaved plane.
>           ret = call_resize_kernel(ctx, s->cu_func_uv, tex,
> -                                 AV_CEIL_RSHIFT(in->width, s->in_desc->log2_chroma_w),
> -                                 AV_CEIL_RSHIFT(in->height, s->in_desc->log2_chroma_h),
> +                                 AV_CEIL_RSHIFT(in->crop_left, s->in_desc->log2_chroma_w),
> +                                 AV_CEIL_RSHIFT(in->crop_top, s->in_desc->log2_chroma_h),
> +                                 AV_CEIL_RSHIFT(crop_width, s->in_desc->log2_chroma_w),
> +                                 AV_CEIL_RSHIFT(crop_height, s->in_desc->log2_chroma_h),
>                                    out,
>                                    AV_CEIL_RSHIFT(out->width, s->out_desc->log2_chroma_w),
>                                    AV_CEIL_RSHIFT(out->height, s->out_desc->log2_chroma_h),
> @@ -545,7 +550,7 @@ static int cudascale_filter_frame(AVFilterLink *link, AVFrame *in)
>       CUcontext dummy;
>       int ret = 0;
>   
> -    if (s->passthrough)
> +    if (s->passthrough && !in->crop_left && !in->crop_top && !in->crop_right && !in->crop_bottom)

The problem with this is that, if the passthrough flag is set, the frame
allocation in cudascale_get_video_buffer() is modified.
It might not be safe to proceed with filtering, which requires
allocating a frame, if the passthrough flag was not already cleared
before entering cudascale_filter_frame().

I looked into this quite a bit when initially reviewing the patch, but
I could not make sense of the implications of the
cudascale_get_video_buffer() function, such as what calls it and when, or
whether it is used at all, given that the filter allocates frames itself
from the hwframesctx, etc.

So if someone with wider knowledge of the avfilter internals involved could
have a look, that'd be great.

Otherwise I'd apply the previous version of this patch, which has annoying
behaviour but is at least known to be safe.

>           return ff_filter_frame(outlink, in);
>   
>       out = av_frame_alloc();
> diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu
> index de06ba9433..271b55cd5d 100644
> --- a/libavfilter/vf_scale_cuda.cu
> +++ b/libavfilter/vf_scale_cuda.cu
> @@ -26,6 +26,7 @@
>   template<typename T>
>   using subsample_function_t = T (*)(cudaTextureObject_t tex, int xo, int yo,
>                                      int dst_width, int dst_height,
> +                                   int src_left, int src_top,
>                                      int src_width, int src_height,
>                                      int bit_depth, float param);
>   
> @@ -64,11 +65,12 @@ static inline __device__ ushort conv_16to10(ushort in)
>                subsample_function_t<in_T_uv> subsample_func_uv>                                  \
>       __device__ static inline void N(cudaTextureObject_t src_tex[4], T *dst[4], int xo, int yo, \
>                                       int dst_width, int dst_height, int dst_pitch,              \
> -                                    int src_width, int src_height, float param)
> +                                    int src_left, int src_top, int src_width, int src_height, float param)
>   
>   #define SUB_F(m, plane) \
>       subsample_func_##m(src_tex[plane], xo, yo, \
>                          dst_width, dst_height,  \
> +                       src_left, src_top,      \
>                          src_width, src_height,  \
>                          in_bit_depth, param)
>   
> @@ -1063,13 +1065,14 @@ template<typename T>
>   __device__ static inline T Subsample_Nearest(cudaTextureObject_t tex,
>                                                int xo, int yo,
>                                                int dst_width, int dst_height,
> +                                             int src_left, int src_top,
>                                                int src_width, int src_height,
>                                                int bit_depth, float param)
>   {
>       float hscale = (float)src_width / (float)dst_width;
>       float vscale = (float)src_height / (float)dst_height;
> -    float xi = (xo + 0.5f) * hscale;
> -    float yi = (yo + 0.5f) * vscale;
> +    float xi = (xo + 0.5f) * hscale + src_left;
> +    float yi = (yo + 0.5f) * vscale + src_top;
>   
>       return tex2D<T>(tex, xi, yi);
>   }
> @@ -1078,13 +1081,14 @@ template<typename T>
>   __device__ static inline T Subsample_Bilinear(cudaTextureObject_t tex,
>                                                 int xo, int yo,
>                                                 int dst_width, int dst_height,
> +                                              int src_left, int src_top,
>                                                 int src_width, int src_height,
>                                                 int bit_depth, float param)
>   {
>       float hscale = (float)src_width / (float)dst_width;
>       float vscale = (float)src_height / (float)dst_height;
> -    float xi = (xo + 0.5f) * hscale;
> -    float yi = (yo + 0.5f) * vscale;
> +    float xi = (xo + 0.5f) * hscale + src_left;
> +    float yi = (yo + 0.5f) * vscale + src_top;
>       // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
>       float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
>       float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
> @@ -1109,13 +1113,14 @@ template<typename T, coeffs_function_t coeffs_function>
>   __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
>                                                int xo, int yo,
>                                                int dst_width, int dst_height,
> +                                             int src_left, int src_top,
>                                                int src_width, int src_height,
>                                                int bit_depth, float param)
>   {
>       float hscale = (float)src_width / (float)dst_width;
>       float vscale = (float)src_height / (float)dst_height;
> -    float xi = (xo + 0.5f) * hscale - 0.5f;
> -    float yi = (yo + 0.5f) * vscale - 0.5f;
> +    float xi = (xo + 0.5f) * hscale - 0.5f + src_left;
> +    float yi = (yo + 0.5f) * vscale - 0.5f + src_top;
>       float px = floor(xi);
>       float py = floor(yi);
>       float fx = xi - px;
> @@ -1147,7 +1152,7 @@ __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
>       cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3, \
>       T *dst_0, T *dst_1, T *dst_2, T *dst_3,                       \
>       int dst_width, int dst_height, int dst_pitch,                 \
> -    int src_width, int src_height, float param
> +    int src_left, int src_top, int src_width, int src_height, float param
>   
>   #define SUBSAMPLE(Convert, T) \
>       cudaTextureObject_t src_tex[4] =                    \
> @@ -1159,6 +1164,7 @@ __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
>       Convert(                                            \
>           src_tex, dst, xo, yo,                           \
>           dst_width, dst_height, dst_pitch,               \
> +        src_left, src_top,                              \
>           src_width, src_height, param);
>   
>   extern "C" {

diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c
index 54a340949d..8615da308a 100644
--- a/libavfilter/vf_scale_cuda.c
+++ b/libavfilter/vf_scale_cuda.c
@@ -407,7 +407,7 @@  fail:
 }
 
 static int call_resize_kernel(AVFilterContext *ctx, CUfunction func,
-                              CUtexObject src_tex[4], int src_width, int src_height,
+                              CUtexObject src_tex[4], int src_left, int src_top, int src_width, int src_height,
                               AVFrame *out_frame, int dst_width, int dst_height, int dst_pitch)
 {
     CUDAScaleContext *s = ctx->priv;
@@ -422,7 +422,7 @@  static int call_resize_kernel(AVFilterContext *ctx, CUfunction func,
         &src_tex[0], &src_tex[1], &src_tex[2], &src_tex[3],
         &dst_devptr[0], &dst_devptr[1], &dst_devptr[2], &dst_devptr[3],
         &dst_width, &dst_height, &dst_pitch,
-        &src_width, &src_height, &s->param
+        &src_left, &src_top, &src_width, &src_height, &s->param
     };
 
     return CHECK_CU(cu->cuLaunchKernel(func,
@@ -440,6 +440,9 @@  static int scalecuda_resize(AVFilterContext *ctx,
 
     CUtexObject tex[4] = { 0, 0, 0, 0 };
 
+    int crop_width = (in->width - in->crop_right) - in->crop_left;
+    int crop_height = (in->height - in->crop_bottom) - in->crop_top;
+
     ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
     if (ret < 0)
         return ret;
@@ -477,7 +480,7 @@  static int scalecuda_resize(AVFilterContext *ctx,
 
     // scale primary plane(s). Usually Y (and A), or single plane of RGB frames.
     ret = call_resize_kernel(ctx, s->cu_func,
-                             tex, in->width, in->height,
+                             tex, in->crop_left, in->crop_top, crop_width, crop_height,
                              out, out->width, out->height, out->linesize[0]);
     if (ret < 0)
         goto exit;
@@ -485,8 +488,10 @@  static int scalecuda_resize(AVFilterContext *ctx,
     if (s->out_planes > 1) {
         // scale UV plane. Scale function sets both U and V plane, or singular interleaved plane.
         ret = call_resize_kernel(ctx, s->cu_func_uv, tex,
-                                 AV_CEIL_RSHIFT(in->width, s->in_desc->log2_chroma_w),
-                                 AV_CEIL_RSHIFT(in->height, s->in_desc->log2_chroma_h),
+                                 AV_CEIL_RSHIFT(in->crop_left, s->in_desc->log2_chroma_w),
+                                 AV_CEIL_RSHIFT(in->crop_top, s->in_desc->log2_chroma_h),
+                                 AV_CEIL_RSHIFT(crop_width, s->in_desc->log2_chroma_w),
+                                 AV_CEIL_RSHIFT(crop_height, s->in_desc->log2_chroma_h),
                                  out,
                                  AV_CEIL_RSHIFT(out->width, s->out_desc->log2_chroma_w),
                                  AV_CEIL_RSHIFT(out->height, s->out_desc->log2_chroma_h),
@@ -545,7 +550,7 @@  static int cudascale_filter_frame(AVFilterLink *link, AVFrame *in)
     CUcontext dummy;
     int ret = 0;
 
-    if (s->passthrough)
+    if (s->passthrough && !in->crop_left && !in->crop_top && !in->crop_right && !in->crop_bottom)
         return ff_filter_frame(outlink, in);
 
     out = av_frame_alloc();
diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu
index de06ba9433..271b55cd5d 100644
--- a/libavfilter/vf_scale_cuda.cu
+++ b/libavfilter/vf_scale_cuda.cu
@@ -26,6 +26,7 @@ 
 template<typename T>
 using subsample_function_t = T (*)(cudaTextureObject_t tex, int xo, int yo,
                                    int dst_width, int dst_height,
+                                   int src_left, int src_top,
                                    int src_width, int src_height,
                                    int bit_depth, float param);
 
@@ -64,11 +65,12 @@  static inline __device__ ushort conv_16to10(ushort in)
              subsample_function_t<in_T_uv> subsample_func_uv>                                  \
     __device__ static inline void N(cudaTextureObject_t src_tex[4], T *dst[4], int xo, int yo, \
                                     int dst_width, int dst_height, int dst_pitch,              \
-                                    int src_width, int src_height, float param)
+                                    int src_left, int src_top, int src_width, int src_height, float param)
 
 #define SUB_F(m, plane) \
     subsample_func_##m(src_tex[plane], xo, yo, \
                        dst_width, dst_height,  \
+                       src_left, src_top,      \
                        src_width, src_height,  \
                        in_bit_depth, param)
 
@@ -1063,13 +1065,14 @@  template<typename T>
 __device__ static inline T Subsample_Nearest(cudaTextureObject_t tex,
                                              int xo, int yo,
                                              int dst_width, int dst_height,
+                                             int src_left, int src_top,
                                              int src_width, int src_height,
                                              int bit_depth, float param)
 {
     float hscale = (float)src_width / (float)dst_width;
     float vscale = (float)src_height / (float)dst_height;
-    float xi = (xo + 0.5f) * hscale;
-    float yi = (yo + 0.5f) * vscale;
+    float xi = (xo + 0.5f) * hscale + src_left;
+    float yi = (yo + 0.5f) * vscale + src_top;
 
     return tex2D<T>(tex, xi, yi);
 }
@@ -1078,13 +1081,14 @@  template<typename T>
 __device__ static inline T Subsample_Bilinear(cudaTextureObject_t tex,
                                               int xo, int yo,
                                               int dst_width, int dst_height,
+                                              int src_left, int src_top,
                                               int src_width, int src_height,
                                               int bit_depth, float param)
 {
     float hscale = (float)src_width / (float)dst_width;
     float vscale = (float)src_height / (float)dst_height;
-    float xi = (xo + 0.5f) * hscale;
-    float yi = (yo + 0.5f) * vscale;
+    float xi = (xo + 0.5f) * hscale + src_left;
+    float yi = (yo + 0.5f) * vscale + src_top;
     // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
     float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
     float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
@@ -1109,13 +1113,14 @@  template<typename T, coeffs_function_t coeffs_function>
 __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
                                              int xo, int yo,
                                              int dst_width, int dst_height,
+                                             int src_left, int src_top,
                                              int src_width, int src_height,
                                              int bit_depth, float param)
 {
     float hscale = (float)src_width / (float)dst_width;
     float vscale = (float)src_height / (float)dst_height;
-    float xi = (xo + 0.5f) * hscale - 0.5f;
-    float yi = (yo + 0.5f) * vscale - 0.5f;
+    float xi = (xo + 0.5f) * hscale - 0.5f + src_left;
+    float yi = (yo + 0.5f) * vscale - 0.5f + src_top;
     float px = floor(xi);
     float py = floor(yi);
     float fx = xi - px;
@@ -1147,7 +1152,7 @@  __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
     cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3, \
     T *dst_0, T *dst_1, T *dst_2, T *dst_3,                       \
     int dst_width, int dst_height, int dst_pitch,                 \
-    int src_width, int src_height, float param
+    int src_left, int src_top, int src_width, int src_height, float param
 
 #define SUBSAMPLE(Convert, T) \
     cudaTextureObject_t src_tex[4] =                    \
@@ -1159,6 +1164,7 @@  __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
     Convert(                                            \
         src_tex, dst, xo, yo,                           \
         dst_width, dst_height, dst_pitch,               \
+        src_left, src_top,                              \
         src_width, src_height, param);
 
 extern "C" {

[FFmpeg-devel,2/4] scale_cuda frame crop support

Commit Message

Comments

Patch