diff mbox series

[FFmpeg-devel,V2,3/3] avfilter/vf_dnn_processing.c: add frame size change support for planar yuv format

Message ID 1582622114-6479-1-git-send-email-yejun.guo@intel.com
State Accepted
Headers show
Series [FFmpeg-devel,V2,1/3] avfilter/vf_dnn_processing.c: use swscale for uint8<->float32 convert | expand

Checks

Context Check Description
andriy/ffmpeg-patchwork success Make fate finished

Commit Message

Guo, Yejun Feb. 25, 2020, 9:15 a.m. UTC
The Y channel is handled by dnn, and also resized by dnn. The UV channels
are resized with swscale.

The command to use espcn.pb (see vf_sr) looks like:
./ffmpeg -i 480p.jpg -vf format=yuv420p,dnn_processing=dnn_backend=tensorflow:model=espcn.pb:input=x:output=y -y tmp.espcn.jpg

Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
---
 doc/filters.texi                |  9 +++++++++
 libavfilter/vf_dnn_processing.c | 37 ++++++++++++++++++++++++++++++-------
 2 files changed, 39 insertions(+), 7 deletions(-)

Comments

Guo, Yejun March 5, 2020, 11:57 p.m. UTC | #1
> -----Original Message-----
> From: Guo, Yejun
> Sent: Tuesday, February 25, 2020 5:15 PM
> To: ffmpeg-devel@ffmpeg.org
> Cc: Guo, Yejun <yejun.guo@intel.com>
> Subject: [PATCH V2 3/3] avfilter/vf_dnn_processing.c: add frame size change
> support for planar yuv format
> 
> The Y channel is handled by dnn, and also resized by dnn. The UV channels
> are resized with swscale.
> 
> The command to use espcn.pb (see vf_sr) looks like:
> ./ffmpeg -i 480p.jpg -vf
> format=yuv420p,dnn_processing=dnn_backend=tensorflow:model=espcn.pb:in
> put=x:output=y -y tmp.espcn.jpg
> 
> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> ---
>  doc/filters.texi                |  9 +++++++++
>  libavfilter/vf_dnn_processing.c | 37 ++++++++++++++++++++++++++++++-------
>  2 files changed, 39 insertions(+), 7 deletions(-)

This patch set is awaiting review, thanks.
Jun Zhao March 6, 2020, 2:49 a.m. UTC | #2
On Tue, Feb 25, 2020 at 5:24 PM Guo, Yejun <yejun.guo@intel.com> wrote:
>
> The Y channel is handled by dnn, and also resized by dnn. The UV channels
> are resized with swscale.
For me, it is a little unusual to resize the Y channel with the dnn backend but resize
the UV channels with FFmpeg's swscale — do they use the same scaling algorithm?

> The command to use espcn.pb (see vf_sr) looks like:
> ./ffmpeg -i 480p.jpg -vf format=yuv420p,dnn_processing=dnn_backend=tensorflow:model=espcn.pb:input=x:output=y -y tmp.espcn.jpg
>
> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> ---
>  doc/filters.texi                |  9 +++++++++
>  libavfilter/vf_dnn_processing.c | 37 ++++++++++++++++++++++++++++++-------
>  2 files changed, 39 insertions(+), 7 deletions(-)
>
> diff --git a/doc/filters.texi b/doc/filters.texi
> index 33b7857..e3df8f9 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -9155,6 +9155,7 @@ ffmpeg -i INPUT -f lavfi -i nullsrc=hd720,geq='r=128+80*(sin(sqrt((X-W/2)*(X-W/2
>  @end example
>  @end itemize
>
> +@anchor{dnn_processing}
>  @section dnn_processing
>
>  Do image processing with deep neural networks. It works together with another filter
> @@ -9216,6 +9217,12 @@ Handle the Y channel with srcnn.pb (see @ref{sr} filter) for frame with yuv420p
>  ./ffmpeg -i 480p.jpg -vf format=yuv420p,scale=w=iw*2:h=ih*2,dnn_processing=dnn_backend=tensorflow:model=srcnn.pb:input=x:output=y -y srcnn.jpg
>  @end example
>
> +@item
> +Handle the Y channel with espcn.pb (see @ref{sr} filter), which changes frame size, for format yuv420p (planar YUV formats supported):
> +@example
> +./ffmpeg -i 480p.jpg -vf format=yuv420p,dnn_processing=dnn_backend=tensorflow:model=espcn.pb:input=x:output=y -y tmp.espcn.jpg
> +@end example
> +
>  @end itemize
>
>  @section drawbox
> @@ -17369,6 +17376,8 @@ Default value is @code{2}. Scale factor is necessary for SRCNN model, because it
>  input upscaled using bicubic upscaling with proper scale factor.
>  @end table
>
> +This feature can also be finished with @ref{dnn_processing} filter.
> +
>  @section ssim
>
>  Obtain the SSIM (Structural SImilarity Metric) between two input videos.
> diff --git a/libavfilter/vf_dnn_processing.c b/libavfilter/vf_dnn_processing.c
> index f9458f0..7f40f85 100644
> --- a/libavfilter/vf_dnn_processing.c
> +++ b/libavfilter/vf_dnn_processing.c
> @@ -51,6 +51,8 @@ typedef struct DnnProcessingContext {
>
>      struct SwsContext *sws_gray8_to_grayf32;
>      struct SwsContext *sws_grayf32_to_gray8;
> +    struct SwsContext *sws_uv_scale;
> +    int sws_uv_height;
>  } DnnProcessingContext;
>
>  #define OFFSET(x) offsetof(DnnProcessingContext, x)
> @@ -274,6 +276,18 @@ static int prepare_sws_context(AVFilterLink *outlink)
>                                                     outlink->h,
>                                                     AV_PIX_FMT_GRAY8,
>                                                     0, NULL, NULL, NULL);
> +
> +        if (inlink->w != outlink->w || inlink->h != outlink->h) {
> +            const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt);
> +            int sws_src_h = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
> +            int sws_src_w = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);
> +            int sws_dst_h = AV_CEIL_RSHIFT(outlink->h, desc->log2_chroma_h);
> +            int sws_dst_w = AV_CEIL_RSHIFT(outlink->w, desc->log2_chroma_w);
> +            ctx->sws_uv_scale = sws_getContext(sws_src_w, sws_src_h, AV_PIX_FMT_GRAY8,
> +                                               sws_dst_w, sws_dst_h, AV_PIX_FMT_GRAY8,
> +                                               SWS_BICUBIC, NULL, NULL, NULL);
> +            ctx->sws_uv_height = sws_src_h;
> +        }
>          return 0;
>      default:
>          //do nothing
> @@ -404,13 +418,21 @@ static av_always_inline int isPlanarYUV(enum AVPixelFormat pix_fmt)
>
>  static int copy_uv_planes(DnnProcessingContext *ctx, AVFrame *out, const AVFrame *in)
>  {
> -    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(in->format);
> -    int uv_height = AV_CEIL_RSHIFT(in->height, desc->log2_chroma_h);
> -    for (int i = 1; i < 3; ++i) {
> -        int bytewidth = av_image_get_linesize(in->format, in->width, i);
> -        av_image_copy_plane(out->data[i], out->linesize[i],
> -                            in->data[i], in->linesize[i],
> -                            bytewidth, uv_height);
> +    if (!ctx->sws_uv_scale) {
> +        av_assert0(in->height == out->height && in->width == out->width);
> +        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(in->format);
> +        int uv_height = AV_CEIL_RSHIFT(in->height, desc->log2_chroma_h);
> +        for (int i = 1; i < 3; ++i) {
> +            int bytewidth = av_image_get_linesize(in->format, in->width, i);
> +            av_image_copy_plane(out->data[i], out->linesize[i],
> +                                in->data[i], in->linesize[i],
> +                                bytewidth, uv_height);
> +        }
> +    } else {
> +        sws_scale(ctx->sws_uv_scale, (const uint8_t **)(in->data + 1), in->linesize + 1,
> +                  0, ctx->sws_uv_height, out->data + 1, out->linesize + 1);
> +        sws_scale(ctx->sws_uv_scale, (const uint8_t **)(in->data + 2), in->linesize + 2,
> +                  0, ctx->sws_uv_height, out->data + 2, out->linesize + 2);
>      }
>
>      return 0;
> @@ -455,6 +477,7 @@ static av_cold void uninit(AVFilterContext *ctx)
>
>      sws_freeContext(context->sws_gray8_to_grayf32);
>      sws_freeContext(context->sws_grayf32_to_gray8);
> +    sws_freeContext(context->sws_uv_scale);
>
>      if (context->dnn_module)
>          (context->dnn_module->free_model)(&context->model);
> --
> 2.7.4
>
Guo, Yejun March 6, 2020, 5:09 a.m. UTC | #3
> -----Original Message-----
> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of
> mypopy@gmail.com
> Sent: Friday, March 06, 2020 10:49 AM
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V2 3/3] avfilter/vf_dnn_processing.c: add
> frame size change support for planar yuv format
> 
> On Tue, Feb 25, 2020 at 5:24 PM Guo, Yejun <yejun.guo@intel.com> wrote:
> >
> > The Y channel is handled by dnn, and also resized by dnn. The UV channels
> > are resized with swscale.
> For me, this is a little weird to resize Y with dnn backend but resize
> UV channel with FFmpeg swscale, is it used the same scale algorithm ?
> 

Thanks for the review. The Y and UV channels use different algorithms:
the algorithm for the Y channel is trained with dnn, while the UV channels are ignored in the dnn model.

It would be nice if all channels were handled by the dnn model. However, the popular
dnn models I know of handle only the Y channel, which is why we have to
rely on swscale to handle the UV channels. The same idea is also
used in vf_sr.c, so we can handle this case first.

I can add such support once I see there are some popular good models
which handle YUV channels together.

thanks
yejun
Pedro Arthur March 6, 2020, 5:24 p.m. UTC | #4
Em qui., 5 de mar. de 2020 às 20:57, Guo, Yejun <yejun.guo@intel.com> escreveu:
>
>
>
> > -----Original Message-----
> > From: Guo, Yejun
> > Sent: Tuesday, February 25, 2020 5:15 PM
> > To: ffmpeg-devel@ffmpeg.org
> > Cc: Guo, Yejun <yejun.guo@intel.com>
> > Subject: [PATCH V2 3/3] avfilter/vf_dnn_processing.c: add frame size change
> > support for planar yuv format
> >
> > The Y channel is handled by dnn, and also resized by dnn. The UV channels
> > are resized with swscale.
> >
> > The command to use espcn.pb (see vf_sr) looks like:
> > ./ffmpeg -i 480p.jpg -vf
> > format=yuv420p,dnn_processing=dnn_backend=tensorflow:model=espcn.pb:in
> > put=x:output=y -y tmp.espcn.jpg
> >
> > Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> > ---
> >  doc/filters.texi                |  9 +++++++++
> >  libavfilter/vf_dnn_processing.c | 37 ++++++++++++++++++++++++++++++-------
> >  2 files changed, 39 insertions(+), 7 deletions(-)
>
> this patch set asks for review, thanks.
I'll not be able to test it in the near future, but code wise LGTM.

> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Pedro Arthur March 6, 2020, 6:01 p.m. UTC | #5
Em sex., 6 de mar. de 2020 às 00:52, mypopy@gmail.com
<mypopy@gmail.com> escreveu:
>
> On Tue, Feb 25, 2020 at 5:24 PM Guo, Yejun <yejun.guo@intel.com> wrote:
> >
> > The Y channel is handled by dnn, and also resized by dnn. The UV channels
> > are resized with swscale.
> For me, this is a little weird to resize Y with dnn backend but resize
> UV channel with FFmpeg swscale, is it used the same scale algorithm ?
Complementing Yejun's response: usually the luminance plane contains
most of the high-frequency content in "natural" images; therefore, most super-resolution
methods are applied only to the Y channel, which is cheaper
than applying them to all channels and yields almost as good results.

>
> > The command to use espcn.pb (see vf_sr) looks like:
> > ./ffmpeg -i 480p.jpg -vf format=yuv420p,dnn_processing=dnn_backend=tensorflow:model=espcn.pb:input=x:output=y -y tmp.espcn.jpg
> >
> > Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> > ---
> >  doc/filters.texi                |  9 +++++++++
> >  libavfilter/vf_dnn_processing.c | 37 ++++++++++++++++++++++++++++++-------
> >  2 files changed, 39 insertions(+), 7 deletions(-)
> >
> > diff --git a/doc/filters.texi b/doc/filters.texi
> > index 33b7857..e3df8f9 100644
> > --- a/doc/filters.texi
> > +++ b/doc/filters.texi
> > @@ -9155,6 +9155,7 @@ ffmpeg -i INPUT -f lavfi -i nullsrc=hd720,geq='r=128+80*(sin(sqrt((X-W/2)*(X-W/2
> >  @end example
> >  @end itemize
> >
> > +@anchor{dnn_processing}
> >  @section dnn_processing
> >
> >  Do image processing with deep neural networks. It works together with another filter
> > @@ -9216,6 +9217,12 @@ Handle the Y channel with srcnn.pb (see @ref{sr} filter) for frame with yuv420p
> >  ./ffmpeg -i 480p.jpg -vf format=yuv420p,scale=w=iw*2:h=ih*2,dnn_processing=dnn_backend=tensorflow:model=srcnn.pb:input=x:output=y -y srcnn.jpg
> >  @end example
> >
> > +@item
> > +Handle the Y channel with espcn.pb (see @ref{sr} filter), which changes frame size, for format yuv420p (planar YUV formats supported):
> > +@example
> > +./ffmpeg -i 480p.jpg -vf format=yuv420p,dnn_processing=dnn_backend=tensorflow:model=espcn.pb:input=x:output=y -y tmp.espcn.jpg
> > +@end example
> > +
> >  @end itemize
> >
> >  @section drawbox
> > @@ -17369,6 +17376,8 @@ Default value is @code{2}. Scale factor is necessary for SRCNN model, because it
> >  input upscaled using bicubic upscaling with proper scale factor.
> >  @end table
> >
> > +This feature can also be finished with @ref{dnn_processing} filter.
> > +
> >  @section ssim
> >
> >  Obtain the SSIM (Structural SImilarity Metric) between two input videos.
> > diff --git a/libavfilter/vf_dnn_processing.c b/libavfilter/vf_dnn_processing.c
> > index f9458f0..7f40f85 100644
> > --- a/libavfilter/vf_dnn_processing.c
> > +++ b/libavfilter/vf_dnn_processing.c
> > @@ -51,6 +51,8 @@ typedef struct DnnProcessingContext {
> >
> >      struct SwsContext *sws_gray8_to_grayf32;
> >      struct SwsContext *sws_grayf32_to_gray8;
> > +    struct SwsContext *sws_uv_scale;
> > +    int sws_uv_height;
> >  } DnnProcessingContext;
> >
> >  #define OFFSET(x) offsetof(DnnProcessingContext, x)
> > @@ -274,6 +276,18 @@ static int prepare_sws_context(AVFilterLink *outlink)
> >                                                     outlink->h,
> >                                                     AV_PIX_FMT_GRAY8,
> >                                                     0, NULL, NULL, NULL);
> > +
> > +        if (inlink->w != outlink->w || inlink->h != outlink->h) {
> > +            const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt);
> > +            int sws_src_h = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
> > +            int sws_src_w = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);
> > +            int sws_dst_h = AV_CEIL_RSHIFT(outlink->h, desc->log2_chroma_h);
> > +            int sws_dst_w = AV_CEIL_RSHIFT(outlink->w, desc->log2_chroma_w);
> > +            ctx->sws_uv_scale = sws_getContext(sws_src_w, sws_src_h, AV_PIX_FMT_GRAY8,
> > +                                               sws_dst_w, sws_dst_h, AV_PIX_FMT_GRAY8,
> > +                                               SWS_BICUBIC, NULL, NULL, NULL);
> > +            ctx->sws_uv_height = sws_src_h;
> > +        }
> >          return 0;
> >      default:
> >          //do nothing
> > @@ -404,13 +418,21 @@ static av_always_inline int isPlanarYUV(enum AVPixelFormat pix_fmt)
> >
> >  static int copy_uv_planes(DnnProcessingContext *ctx, AVFrame *out, const AVFrame *in)
> >  {
> > -    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(in->format);
> > -    int uv_height = AV_CEIL_RSHIFT(in->height, desc->log2_chroma_h);
> > -    for (int i = 1; i < 3; ++i) {
> > -        int bytewidth = av_image_get_linesize(in->format, in->width, i);
> > -        av_image_copy_plane(out->data[i], out->linesize[i],
> > -                            in->data[i], in->linesize[i],
> > -                            bytewidth, uv_height);
> > +    if (!ctx->sws_uv_scale) {
> > +        av_assert0(in->height == out->height && in->width == out->width);
> > +        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(in->format);
> > +        int uv_height = AV_CEIL_RSHIFT(in->height, desc->log2_chroma_h);
> > +        for (int i = 1; i < 3; ++i) {
> > +            int bytewidth = av_image_get_linesize(in->format, in->width, i);
> > +            av_image_copy_plane(out->data[i], out->linesize[i],
> > +                                in->data[i], in->linesize[i],
> > +                                bytewidth, uv_height);
> > +        }
> > +    } else {
> > +        sws_scale(ctx->sws_uv_scale, (const uint8_t **)(in->data + 1), in->linesize + 1,
> > +                  0, ctx->sws_uv_height, out->data + 1, out->linesize + 1);
> > +        sws_scale(ctx->sws_uv_scale, (const uint8_t **)(in->data + 2), in->linesize + 2,
> > +                  0, ctx->sws_uv_height, out->data + 2, out->linesize + 2);
> >      }
> >
> >      return 0;
> > @@ -455,6 +477,7 @@ static av_cold void uninit(AVFilterContext *ctx)
> >
> >      sws_freeContext(context->sws_gray8_to_grayf32);
> >      sws_freeContext(context->sws_grayf32_to_gray8);
> > +    sws_freeContext(context->sws_uv_scale);
> >
> >      if (context->dnn_module)
> >          (context->dnn_module->free_model)(&context->model);
> > --
> > 2.7.4
> >
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Guo, Yejun March 11, 2020, 6:30 a.m. UTC | #6
> -----Original Message-----
> From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of
> Pedro Arthur
> Sent: Saturday, March 07, 2020 1:25 AM
> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH V2 3/3] avfilter/vf_dnn_processing.c: add
> frame size change support for planar yuv format
> 
> Em qui., 5 de mar. de 2020 às 20:57, Guo, Yejun <yejun.guo@intel.com>
> escreveu:
> >
> >
> >
> > > -----Original Message-----
> > > From: Guo, Yejun
> > > Sent: Tuesday, February 25, 2020 5:15 PM
> > > To: ffmpeg-devel@ffmpeg.org
> > > Cc: Guo, Yejun <yejun.guo@intel.com>
> > > Subject: [PATCH V2 3/3] avfilter/vf_dnn_processing.c: add frame size
> change
> > > support for planar yuv format
> > >
> > > The Y channel is handled by dnn, and also resized by dnn. The UV channels
> > > are resized with swscale.
> > >
> > > The command to use espcn.pb (see vf_sr) looks like:
> > > ./ffmpeg -i 480p.jpg -vf
> > >
> format=yuv420p,dnn_processing=dnn_backend=tensorflow:model=espcn.pb:in
> > > put=x:output=y -y tmp.espcn.jpg
> > >
> > > Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
> > > ---
> > >  doc/filters.texi                |  9 +++++++++
> > >  libavfilter/vf_dnn_processing.c | 37
> ++++++++++++++++++++++++++++++-------
> > >  2 files changed, 39 insertions(+), 7 deletions(-)
> >
> > this patch set asks for review, thanks.
> I'll not be able to test it in the near future, but code wise LGTM.
> 

Thanks, I will push after 24 hours if there are no other comments.
diff mbox series

Patch

diff --git a/doc/filters.texi b/doc/filters.texi
index 33b7857..e3df8f9 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -9155,6 +9155,7 @@  ffmpeg -i INPUT -f lavfi -i nullsrc=hd720,geq='r=128+80*(sin(sqrt((X-W/2)*(X-W/2
 @end example
 @end itemize
 
+@anchor{dnn_processing}
 @section dnn_processing
 
 Do image processing with deep neural networks. It works together with another filter
@@ -9216,6 +9217,12 @@  Handle the Y channel with srcnn.pb (see @ref{sr} filter) for frame with yuv420p
 ./ffmpeg -i 480p.jpg -vf format=yuv420p,scale=w=iw*2:h=ih*2,dnn_processing=dnn_backend=tensorflow:model=srcnn.pb:input=x:output=y -y srcnn.jpg
 @end example
 
+@item
+Handle the Y channel with espcn.pb (see @ref{sr} filter), which changes frame size, for format yuv420p (planar YUV formats supported):
+@example
+./ffmpeg -i 480p.jpg -vf format=yuv420p,dnn_processing=dnn_backend=tensorflow:model=espcn.pb:input=x:output=y -y tmp.espcn.jpg
+@end example
+
 @end itemize
 
 @section drawbox
@@ -17369,6 +17376,8 @@  Default value is @code{2}. Scale factor is necessary for SRCNN model, because it
 input upscaled using bicubic upscaling with proper scale factor.
 @end table
 
+This feature can also be finished with @ref{dnn_processing} filter.
+
 @section ssim
 
 Obtain the SSIM (Structural SImilarity Metric) between two input videos.
diff --git a/libavfilter/vf_dnn_processing.c b/libavfilter/vf_dnn_processing.c
index f9458f0..7f40f85 100644
--- a/libavfilter/vf_dnn_processing.c
+++ b/libavfilter/vf_dnn_processing.c
@@ -51,6 +51,8 @@  typedef struct DnnProcessingContext {
 
     struct SwsContext *sws_gray8_to_grayf32;
     struct SwsContext *sws_grayf32_to_gray8;
+    struct SwsContext *sws_uv_scale;
+    int sws_uv_height;
 } DnnProcessingContext;
 
 #define OFFSET(x) offsetof(DnnProcessingContext, x)
@@ -274,6 +276,18 @@  static int prepare_sws_context(AVFilterLink *outlink)
                                                    outlink->h,
                                                    AV_PIX_FMT_GRAY8,
                                                    0, NULL, NULL, NULL);
+
+        if (inlink->w != outlink->w || inlink->h != outlink->h) {
+            const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt);
+            int sws_src_h = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
+            int sws_src_w = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);
+            int sws_dst_h = AV_CEIL_RSHIFT(outlink->h, desc->log2_chroma_h);
+            int sws_dst_w = AV_CEIL_RSHIFT(outlink->w, desc->log2_chroma_w);
+            ctx->sws_uv_scale = sws_getContext(sws_src_w, sws_src_h, AV_PIX_FMT_GRAY8,
+                                               sws_dst_w, sws_dst_h, AV_PIX_FMT_GRAY8,
+                                               SWS_BICUBIC, NULL, NULL, NULL);
+            ctx->sws_uv_height = sws_src_h;
+        }
         return 0;
     default:
         //do nothing
@@ -404,13 +418,21 @@  static av_always_inline int isPlanarYUV(enum AVPixelFormat pix_fmt)
 
 static int copy_uv_planes(DnnProcessingContext *ctx, AVFrame *out, const AVFrame *in)
 {
-    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(in->format);
-    int uv_height = AV_CEIL_RSHIFT(in->height, desc->log2_chroma_h);
-    for (int i = 1; i < 3; ++i) {
-        int bytewidth = av_image_get_linesize(in->format, in->width, i);
-        av_image_copy_plane(out->data[i], out->linesize[i],
-                            in->data[i], in->linesize[i],
-                            bytewidth, uv_height);
+    if (!ctx->sws_uv_scale) {
+        av_assert0(in->height == out->height && in->width == out->width);
+        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(in->format);
+        int uv_height = AV_CEIL_RSHIFT(in->height, desc->log2_chroma_h);
+        for (int i = 1; i < 3; ++i) {
+            int bytewidth = av_image_get_linesize(in->format, in->width, i);
+            av_image_copy_plane(out->data[i], out->linesize[i],
+                                in->data[i], in->linesize[i],
+                                bytewidth, uv_height);
+        }
+    } else {
+        sws_scale(ctx->sws_uv_scale, (const uint8_t **)(in->data + 1), in->linesize + 1,
+                  0, ctx->sws_uv_height, out->data + 1, out->linesize + 1);
+        sws_scale(ctx->sws_uv_scale, (const uint8_t **)(in->data + 2), in->linesize + 2,
+                  0, ctx->sws_uv_height, out->data + 2, out->linesize + 2);
     }
 
     return 0;
@@ -455,6 +477,7 @@  static av_cold void uninit(AVFilterContext *ctx)
 
     sws_freeContext(context->sws_gray8_to_grayf32);
     sws_freeContext(context->sws_grayf32_to_gray8);
+    sws_freeContext(context->sws_uv_scale);
 
     if (context->dnn_module)
         (context->dnn_module->free_model)(&context->model);