diff mbox series

[FFmpeg-devel,2/3] scale_cuda frame crop support

Message ID 20240910181057.43453-2-koushd@gmail.com
State New
Headers show
Series [FFmpeg-devel,1/3] scale_vt frame crop support | expand

Checks

Context Check Description
andriy/commit_msg_x86 warning The first line of the commit message must start with a context terminated by a colon and a space, for example "lavu/opt: " or "doc: ".
yinshiyou/commit_msg_loongarch64 warning The first line of the commit message must start with a context terminated by a colon and a space, for example "lavu/opt: " or "doc: ".
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Koushik Dutta Sept. 10, 2024, 6:10 p.m. UTC
The crop filter has no effect on scale_cuda:

-vf crop=100:100,scale_cuda=300x300

Hardware frames (AV_PIX_FMT_FLAG_HWACCEL) are expected to use the crop_* properties,
as seen in the implementation vf_crop.c.

The current workaround is to hwdownload the full frame
and perform the crop on CPU.
---
 libavfilter/vf_scale_cuda.c  | 15 ++++++++++-----
 libavfilter/vf_scale_cuda.cu | 22 ++++++++++++++--------
 2 files changed, 24 insertions(+), 13 deletions(-)

Comments

Timo Rothenpieler Sept. 10, 2024, 6:37 p.m. UTC | #1
On 10.09.2024 20:10, Koushik Dutta wrote:
> The crop filter has no effect on scale_cuda:
> 
> -vf crop=100:100,scale_cuda=300x300
> 
> Hardware frames (AV_PIX_FMT_FLAG_HWACCEL) are expected to use the crop_* properties,
> as seen in the implementation vf_crop.c.
> 
> The current workaround is to hwdownload the full frame
> and perform the crop on CPU.
> ---
>   libavfilter/vf_scale_cuda.c  | 15 ++++++++++-----
>   libavfilter/vf_scale_cuda.cu | 22 ++++++++++++++--------
>   2 files changed, 24 insertions(+), 13 deletions(-)
> 
> diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c
> index 54a340949d..eb8beee771 100644
> --- a/libavfilter/vf_scale_cuda.c
> +++ b/libavfilter/vf_scale_cuda.c
> @@ -407,7 +407,7 @@ fail:
>   }
>   
>   static int call_resize_kernel(AVFilterContext *ctx, CUfunction func,
> -                              CUtexObject src_tex[4], int src_width, int src_height,
> +                              CUtexObject src_tex[4], int src_left, int src_top, int src_width, int src_height,
>                                 AVFrame *out_frame, int dst_width, int dst_height, int dst_pitch)
>   {
>       CUDAScaleContext *s = ctx->priv;
> @@ -422,7 +422,7 @@ static int call_resize_kernel(AVFilterContext *ctx, CUfunction func,
>           &src_tex[0], &src_tex[1], &src_tex[2], &src_tex[3],
>           &dst_devptr[0], &dst_devptr[1], &dst_devptr[2], &dst_devptr[3],
>           &dst_width, &dst_height, &dst_pitch,
> -        &src_width, &src_height, &s->param
> +        &src_left, &src_top, &src_width, &src_height, &s->param
>       };
>   
>       return CHECK_CU(cu->cuLaunchKernel(func,
> @@ -440,6 +440,9 @@ static int scalecuda_resize(AVFilterContext *ctx,
>   
>       CUtexObject tex[4] = { 0, 0, 0, 0 };
>   
> +    int crop_width = (in->width - in->crop_right) - in->crop_left;
> +    int crop_height = (in->height - in->crop_bottom) - in->crop_top;

Bit of a nit, but I don't think the parenthesis are neccesary.

> +
>       ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
>       if (ret < 0)
>           return ret;
> @@ -477,7 +480,7 @@ static int scalecuda_resize(AVFilterContext *ctx,
>   
>       // scale primary plane(s). Usually Y (and A), or single plane of RGB frames.
>       ret = call_resize_kernel(ctx, s->cu_func,
> -                             tex, in->width, in->height,
> +                             tex, in->crop_left, in->crop_top, crop_width, crop_height,
>                                out, out->width, out->height, out->linesize[0]);
>       if (ret < 0)
>           goto exit;
> @@ -485,8 +488,10 @@ static int scalecuda_resize(AVFilterContext *ctx,
>       if (s->out_planes > 1) {
>           // scale UV plane. Scale function sets both U and V plane, or singular interleaved plane.
>           ret = call_resize_kernel(ctx, s->cu_func_uv, tex,
> -                                 AV_CEIL_RSHIFT(in->width, s->in_desc->log2_chroma_w),
> -                                 AV_CEIL_RSHIFT(in->height, s->in_desc->log2_chroma_h),
> +                                 AV_CEIL_RSHIFT(in->crop_left, s->in_desc->log2_chroma_w),
> +                                 AV_CEIL_RSHIFT(in->crop_top, s->in_desc->log2_chroma_h),
> +                                 AV_CEIL_RSHIFT(crop_width, s->in_desc->log2_chroma_w),
> +                                 AV_CEIL_RSHIFT(crop_height, s->in_desc->log2_chroma_h),
>                                    out,
>                                    AV_CEIL_RSHIFT(out->width, s->out_desc->log2_chroma_w),
>                                    AV_CEIL_RSHIFT(out->height, s->out_desc->log2_chroma_h),
> diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu
> index de06ba9433..271b55cd5d 100644
> --- a/libavfilter/vf_scale_cuda.cu
> +++ b/libavfilter/vf_scale_cuda.cu
> @@ -26,6 +26,7 @@
>   template<typename T>
>   using subsample_function_t = T (*)(cudaTextureObject_t tex, int xo, int yo,
>                                      int dst_width, int dst_height,
> +                                   int src_left, int src_top,
>                                      int src_width, int src_height,
>                                      int bit_depth, float param);
>   
> @@ -64,11 +65,12 @@ static inline __device__ ushort conv_16to10(ushort in)
>                subsample_function_t<in_T_uv> subsample_func_uv>                                  \
>       __device__ static inline void N(cudaTextureObject_t src_tex[4], T *dst[4], int xo, int yo, \
>                                       int dst_width, int dst_height, int dst_pitch,              \
> -                                    int src_width, int src_height, float param)
> +                                    int src_left, int src_top, int src_width, int src_height, float param)
>   
>   #define SUB_F(m, plane) \
>       subsample_func_##m(src_tex[plane], xo, yo, \
>                          dst_width, dst_height,  \
> +                       src_left, src_top,      \
>                          src_width, src_height,  \
>                          in_bit_depth, param)
>   
> @@ -1063,13 +1065,14 @@ template<typename T>
>   __device__ static inline T Subsample_Nearest(cudaTextureObject_t tex,
>                                                int xo, int yo,
>                                                int dst_width, int dst_height,
> +                                             int src_left, int src_top,
>                                                int src_width, int src_height,
>                                                int bit_depth, float param)
>   {
>       float hscale = (float)src_width / (float)dst_width;
>       float vscale = (float)src_height / (float)dst_height;
> -    float xi = (xo + 0.5f) * hscale;
> -    float yi = (yo + 0.5f) * vscale;
> +    float xi = (xo + 0.5f) * hscale + src_left;
> +    float yi = (yo + 0.5f) * vscale + src_top;
>   
>       return tex2D<T>(tex, xi, yi);
>   }
> @@ -1078,13 +1081,14 @@ template<typename T>
>   __device__ static inline T Subsample_Bilinear(cudaTextureObject_t tex,
>                                                 int xo, int yo,
>                                                 int dst_width, int dst_height,
> +                                              int src_left, int src_top,
>                                                 int src_width, int src_height,
>                                                 int bit_depth, float param)
>   {
>       float hscale = (float)src_width / (float)dst_width;
>       float vscale = (float)src_height / (float)dst_height;
> -    float xi = (xo + 0.5f) * hscale;
> -    float yi = (yo + 0.5f) * vscale;
> +    float xi = (xo + 0.5f) * hscale + src_left;
> +    float yi = (yo + 0.5f) * vscale + src_top;
>       // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
>       float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
>       float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
> @@ -1109,13 +1113,14 @@ template<typename T, coeffs_function_t coeffs_function>
>   __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
>                                                int xo, int yo,
>                                                int dst_width, int dst_height,
> +                                             int src_left, int src_top,
>                                                int src_width, int src_height,
>                                                int bit_depth, float param)
>   {
>       float hscale = (float)src_width / (float)dst_width;
>       float vscale = (float)src_height / (float)dst_height;
> -    float xi = (xo + 0.5f) * hscale - 0.5f;
> -    float yi = (yo + 0.5f) * vscale - 0.5f;
> +    float xi = (xo + 0.5f) * hscale - 0.5f + src_left;
> +    float yi = (yo + 0.5f) * vscale - 0.5f + src_top;
>       float px = floor(xi);
>       float py = floor(yi);
>       float fx = xi - px;
> @@ -1147,7 +1152,7 @@ __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
>       cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3, \
>       T *dst_0, T *dst_1, T *dst_2, T *dst_3,                       \
>       int dst_width, int dst_height, int dst_pitch,                 \
> -    int src_width, int src_height, float param
> +    int src_left, int src_top, int src_width, int src_height, float param
>   
>   #define SUBSAMPLE(Convert, T) \
>       cudaTextureObject_t src_tex[4] =                    \
> @@ -1159,6 +1164,7 @@ __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
>       Convert(                                            \
>           src_tex, dst, xo, yo,                           \
>           dst_width, dst_height, dst_pitch,               \
> +        src_left, src_top,                              \
>           src_width, src_height, param);
>   
>   extern "C" {

Looks good to me otherwise, will give it a test later.
Timo Rothenpieler Oct. 6, 2024, 7:09 p.m. UTC | #2
On 10.09.2024 20:10, Koushik Dutta wrote:
> The crop filter has no effect on scale_cuda:
> 
> -vf crop=100:100,scale_cuda=300x300
> 
> Hardware frames (AV_PIX_FMT_FLAG_HWACCEL) are expected to use the crop_* properties,
> as seen in the implementation vf_crop.c.
> 
> The current workaround is to hwdownload the full frame
> and perform the crop on CPU.
> ---
>   libavfilter/vf_scale_cuda.c  | 15 ++++++++++-----
>   libavfilter/vf_scale_cuda.cu | 22 ++++++++++++++--------
>   2 files changed, 24 insertions(+), 13 deletions(-)
> 
> diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c
> index 54a340949d..eb8beee771 100644
> --- a/libavfilter/vf_scale_cuda.c
> +++ b/libavfilter/vf_scale_cuda.c
> @@ -407,7 +407,7 @@ fail:
>   }
>   
>   static int call_resize_kernel(AVFilterContext *ctx, CUfunction func,
> -                              CUtexObject src_tex[4], int src_width, int src_height,
> +                              CUtexObject src_tex[4], int src_left, int src_top, int src_width, int src_height,
>                                 AVFrame *out_frame, int dst_width, int dst_height, int dst_pitch)
>   {
>       CUDAScaleContext *s = ctx->priv;
> @@ -422,7 +422,7 @@ static int call_resize_kernel(AVFilterContext *ctx, CUfunction func,
>           &src_tex[0], &src_tex[1], &src_tex[2], &src_tex[3],
>           &dst_devptr[0], &dst_devptr[1], &dst_devptr[2], &dst_devptr[3],
>           &dst_width, &dst_height, &dst_pitch,
> -        &src_width, &src_height, &s->param
> +        &src_left, &src_top, &src_width, &src_height, &s->param
>       };
>   
>       return CHECK_CU(cu->cuLaunchKernel(func,
> @@ -440,6 +440,9 @@ static int scalecuda_resize(AVFilterContext *ctx,
>   
>       CUtexObject tex[4] = { 0, 0, 0, 0 };
>   
> +    int crop_width = (in->width - in->crop_right) - in->crop_left;
> +    int crop_height = (in->height - in->crop_bottom) - in->crop_top;
> +
>       ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
>       if (ret < 0)
>           return ret;
> @@ -477,7 +480,7 @@ static int scalecuda_resize(AVFilterContext *ctx,
>   
>       // scale primary plane(s). Usually Y (and A), or single plane of RGB frames.
>       ret = call_resize_kernel(ctx, s->cu_func,
> -                             tex, in->width, in->height,
> +                             tex, in->crop_left, in->crop_top, crop_width, crop_height,
>                                out, out->width, out->height, out->linesize[0]);
>       if (ret < 0)
>           goto exit;
> @@ -485,8 +488,10 @@ static int scalecuda_resize(AVFilterContext *ctx,
>       if (s->out_planes > 1) {
>           // scale UV plane. Scale function sets both U and V plane, or singular interleaved plane.
>           ret = call_resize_kernel(ctx, s->cu_func_uv, tex,
> -                                 AV_CEIL_RSHIFT(in->width, s->in_desc->log2_chroma_w),
> -                                 AV_CEIL_RSHIFT(in->height, s->in_desc->log2_chroma_h),
> +                                 AV_CEIL_RSHIFT(in->crop_left, s->in_desc->log2_chroma_w),
> +                                 AV_CEIL_RSHIFT(in->crop_top, s->in_desc->log2_chroma_h),
> +                                 AV_CEIL_RSHIFT(crop_width, s->in_desc->log2_chroma_w),
> +                                 AV_CEIL_RSHIFT(crop_height, s->in_desc->log2_chroma_h),
>                                    out,
>                                    AV_CEIL_RSHIFT(out->width, s->out_desc->log2_chroma_w),
>                                    AV_CEIL_RSHIFT(out->height, s->out_desc->log2_chroma_h),
> diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu
> index de06ba9433..271b55cd5d 100644
> --- a/libavfilter/vf_scale_cuda.cu
> +++ b/libavfilter/vf_scale_cuda.cu
> @@ -26,6 +26,7 @@
>   template<typename T>
>   using subsample_function_t = T (*)(cudaTextureObject_t tex, int xo, int yo,
>                                      int dst_width, int dst_height,
> +                                   int src_left, int src_top,
>                                      int src_width, int src_height,
>                                      int bit_depth, float param);
>   
> @@ -64,11 +65,12 @@ static inline __device__ ushort conv_16to10(ushort in)
>                subsample_function_t<in_T_uv> subsample_func_uv>                                  \
>       __device__ static inline void N(cudaTextureObject_t src_tex[4], T *dst[4], int xo, int yo, \
>                                       int dst_width, int dst_height, int dst_pitch,              \
> -                                    int src_width, int src_height, float param)
> +                                    int src_left, int src_top, int src_width, int src_height, float param)
>   
>   #define SUB_F(m, plane) \
>       subsample_func_##m(src_tex[plane], xo, yo, \
>                          dst_width, dst_height,  \
> +                       src_left, src_top,      \
>                          src_width, src_height,  \
>                          in_bit_depth, param)
>   
> @@ -1063,13 +1065,14 @@ template<typename T>
>   __device__ static inline T Subsample_Nearest(cudaTextureObject_t tex,
>                                                int xo, int yo,
>                                                int dst_width, int dst_height,
> +                                             int src_left, int src_top,
>                                                int src_width, int src_height,
>                                                int bit_depth, float param)
>   {
>       float hscale = (float)src_width / (float)dst_width;
>       float vscale = (float)src_height / (float)dst_height;
> -    float xi = (xo + 0.5f) * hscale;
> -    float yi = (yo + 0.5f) * vscale;
> +    float xi = (xo + 0.5f) * hscale + src_left;
> +    float yi = (yo + 0.5f) * vscale + src_top;
>   
>       return tex2D<T>(tex, xi, yi);
>   }
> @@ -1078,13 +1081,14 @@ template<typename T>
>   __device__ static inline T Subsample_Bilinear(cudaTextureObject_t tex,
>                                                 int xo, int yo,
>                                                 int dst_width, int dst_height,
> +                                              int src_left, int src_top,
>                                                 int src_width, int src_height,
>                                                 int bit_depth, float param)
>   {
>       float hscale = (float)src_width / (float)dst_width;
>       float vscale = (float)src_height / (float)dst_height;
> -    float xi = (xo + 0.5f) * hscale;
> -    float yi = (yo + 0.5f) * vscale;
> +    float xi = (xo + 0.5f) * hscale + src_left;
> +    float yi = (yo + 0.5f) * vscale + src_top;
>       // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
>       float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
>       float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
> @@ -1109,13 +1113,14 @@ template<typename T, coeffs_function_t coeffs_function>
>   __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
>                                                int xo, int yo,
>                                                int dst_width, int dst_height,
> +                                             int src_left, int src_top,
>                                                int src_width, int src_height,
>                                                int bit_depth, float param)
>   {
>       float hscale = (float)src_width / (float)dst_width;
>       float vscale = (float)src_height / (float)dst_height;
> -    float xi = (xo + 0.5f) * hscale - 0.5f;
> -    float yi = (yo + 0.5f) * vscale - 0.5f;
> +    float xi = (xo + 0.5f) * hscale - 0.5f + src_left;
> +    float yi = (yo + 0.5f) * vscale - 0.5f + src_top;
>       float px = floor(xi);
>       float py = floor(yi);
>       float fx = xi - px;
> @@ -1147,7 +1152,7 @@ __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
>       cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3, \
>       T *dst_0, T *dst_1, T *dst_2, T *dst_3,                       \
>       int dst_width, int dst_height, int dst_pitch,                 \
> -    int src_width, int src_height, float param
> +    int src_left, int src_top, int src_width, int src_height, float param
>   
>   #define SUBSAMPLE(Convert, T) \
>       cudaTextureObject_t src_tex[4] =                    \
> @@ -1159,6 +1164,7 @@ __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
>       Convert(                                            \
>           src_tex, dst, xo, yo,                           \
>           dst_width, dst_height, dst_pitch,               \
> +        src_left, src_top,                              \
>           src_width, src_height, param);
>   
>   extern "C" {

One problem I noticed with this is that if no scaling operation is 
performed, the filter will go into passthrough mode, and ignore the 
cropping info.

Also, when manually forcing passthrough to off, it will crop, but then 
scale back up to the original size.
Is this the intended behaviour for a cropping hw-scale-filter? It does 
not match the behaviour of the sw crop filter.

I'd expect the output dimensions to be that of the cropped frame, 
without any scaling applied to it, if no further parameters are provided.
But it's also non-trivial to achieve that, since at the time the filter 
determined all that info is at init time, where no info about cropping 
is available.
diff mbox series

Patch

diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c
index 54a340949d..eb8beee771 100644
--- a/libavfilter/vf_scale_cuda.c
+++ b/libavfilter/vf_scale_cuda.c
@@ -407,7 +407,7 @@  fail:
 }
 
 static int call_resize_kernel(AVFilterContext *ctx, CUfunction func,
-                              CUtexObject src_tex[4], int src_width, int src_height,
+                              CUtexObject src_tex[4], int src_left, int src_top, int src_width, int src_height,
                               AVFrame *out_frame, int dst_width, int dst_height, int dst_pitch)
 {
     CUDAScaleContext *s = ctx->priv;
@@ -422,7 +422,7 @@  static int call_resize_kernel(AVFilterContext *ctx, CUfunction func,
         &src_tex[0], &src_tex[1], &src_tex[2], &src_tex[3],
         &dst_devptr[0], &dst_devptr[1], &dst_devptr[2], &dst_devptr[3],
         &dst_width, &dst_height, &dst_pitch,
-        &src_width, &src_height, &s->param
+        &src_left, &src_top, &src_width, &src_height, &s->param
     };
 
     return CHECK_CU(cu->cuLaunchKernel(func,
@@ -440,6 +440,9 @@  static int scalecuda_resize(AVFilterContext *ctx,
 
     CUtexObject tex[4] = { 0, 0, 0, 0 };
 
+    int crop_width = (in->width - in->crop_right) - in->crop_left;
+    int crop_height = (in->height - in->crop_bottom) - in->crop_top;
+
     ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
     if (ret < 0)
         return ret;
@@ -477,7 +480,7 @@  static int scalecuda_resize(AVFilterContext *ctx,
 
     // scale primary plane(s). Usually Y (and A), or single plane of RGB frames.
     ret = call_resize_kernel(ctx, s->cu_func,
-                             tex, in->width, in->height,
+                             tex, in->crop_left, in->crop_top, crop_width, crop_height,
                              out, out->width, out->height, out->linesize[0]);
     if (ret < 0)
         goto exit;
@@ -485,8 +488,10 @@  static int scalecuda_resize(AVFilterContext *ctx,
     if (s->out_planes > 1) {
         // scale UV plane. Scale function sets both U and V plane, or singular interleaved plane.
         ret = call_resize_kernel(ctx, s->cu_func_uv, tex,
-                                 AV_CEIL_RSHIFT(in->width, s->in_desc->log2_chroma_w),
-                                 AV_CEIL_RSHIFT(in->height, s->in_desc->log2_chroma_h),
+                                 AV_CEIL_RSHIFT(in->crop_left, s->in_desc->log2_chroma_w),
+                                 AV_CEIL_RSHIFT(in->crop_top, s->in_desc->log2_chroma_h),
+                                 AV_CEIL_RSHIFT(crop_width, s->in_desc->log2_chroma_w),
+                                 AV_CEIL_RSHIFT(crop_height, s->in_desc->log2_chroma_h),
                                  out,
                                  AV_CEIL_RSHIFT(out->width, s->out_desc->log2_chroma_w),
                                  AV_CEIL_RSHIFT(out->height, s->out_desc->log2_chroma_h),
diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu
index de06ba9433..271b55cd5d 100644
--- a/libavfilter/vf_scale_cuda.cu
+++ b/libavfilter/vf_scale_cuda.cu
@@ -26,6 +26,7 @@ 
 template<typename T>
 using subsample_function_t = T (*)(cudaTextureObject_t tex, int xo, int yo,
                                    int dst_width, int dst_height,
+                                   int src_left, int src_top,
                                    int src_width, int src_height,
                                    int bit_depth, float param);
 
@@ -64,11 +65,12 @@  static inline __device__ ushort conv_16to10(ushort in)
              subsample_function_t<in_T_uv> subsample_func_uv>                                  \
     __device__ static inline void N(cudaTextureObject_t src_tex[4], T *dst[4], int xo, int yo, \
                                     int dst_width, int dst_height, int dst_pitch,              \
-                                    int src_width, int src_height, float param)
+                                    int src_left, int src_top, int src_width, int src_height, float param)
 
 #define SUB_F(m, plane) \
     subsample_func_##m(src_tex[plane], xo, yo, \
                        dst_width, dst_height,  \
+                       src_left, src_top,      \
                        src_width, src_height,  \
                        in_bit_depth, param)
 
@@ -1063,13 +1065,14 @@  template<typename T>
 __device__ static inline T Subsample_Nearest(cudaTextureObject_t tex,
                                              int xo, int yo,
                                              int dst_width, int dst_height,
+                                             int src_left, int src_top,
                                              int src_width, int src_height,
                                              int bit_depth, float param)
 {
     float hscale = (float)src_width / (float)dst_width;
     float vscale = (float)src_height / (float)dst_height;
-    float xi = (xo + 0.5f) * hscale;
-    float yi = (yo + 0.5f) * vscale;
+    float xi = (xo + 0.5f) * hscale + src_left;
+    float yi = (yo + 0.5f) * vscale + src_top;
 
     return tex2D<T>(tex, xi, yi);
 }
@@ -1078,13 +1081,14 @@  template<typename T>
 __device__ static inline T Subsample_Bilinear(cudaTextureObject_t tex,
                                               int xo, int yo,
                                               int dst_width, int dst_height,
+                                              int src_left, int src_top,
                                               int src_width, int src_height,
                                               int bit_depth, float param)
 {
     float hscale = (float)src_width / (float)dst_width;
     float vscale = (float)src_height / (float)dst_height;
-    float xi = (xo + 0.5f) * hscale;
-    float yi = (yo + 0.5f) * vscale;
+    float xi = (xo + 0.5f) * hscale + src_left;
+    float yi = (yo + 0.5f) * vscale + src_top;
     // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
     float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
     float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
@@ -1109,13 +1113,14 @@  template<typename T, coeffs_function_t coeffs_function>
 __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
                                              int xo, int yo,
                                              int dst_width, int dst_height,
+                                             int src_left, int src_top,
                                              int src_width, int src_height,
                                              int bit_depth, float param)
 {
     float hscale = (float)src_width / (float)dst_width;
     float vscale = (float)src_height / (float)dst_height;
-    float xi = (xo + 0.5f) * hscale - 0.5f;
-    float yi = (yo + 0.5f) * vscale - 0.5f;
+    float xi = (xo + 0.5f) * hscale - 0.5f + src_left;
+    float yi = (yo + 0.5f) * vscale - 0.5f + src_top;
     float px = floor(xi);
     float py = floor(yi);
     float fx = xi - px;
@@ -1147,7 +1152,7 @@  __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
     cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3, \
     T *dst_0, T *dst_1, T *dst_2, T *dst_3,                       \
     int dst_width, int dst_height, int dst_pitch,                 \
-    int src_width, int src_height, float param
+    int src_left, int src_top, int src_width, int src_height, float param
 
 #define SUBSAMPLE(Convert, T) \
     cudaTextureObject_t src_tex[4] =                    \
@@ -1159,6 +1164,7 @@  __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
     Convert(                                            \
         src_tex, dst, xo, yo,                           \
         dst_width, dst_height, dst_pitch,               \
+        src_left, src_top,                              \
         src_width, src_height, param);
 
 extern "C" {