[FFmpeg-devel,1/3] lavc/qsvdec: add support for gpu_copy

Submitted by Linjie Fu on March 26, 2019, 5:38 a.m.

Details

Message ID 20190326053824.24220-1-linjie.fu@intel.com
State New
Headers show

Commit Message

Linjie Fu March 26, 2019, 5:38 a.m.
Add support for GPU copy when QSV decoders work in system memory mode.
However, memory must be contiguous and aligned to 128x64 to enable this
feature (first introduced in FFmpeg 3.3.1).

GPUCopy = MFX_GPUCOPY_ON leads to a performance improvement of up to 10x.

CMD:
ffmpeg -init_hw_device qsv=hw -filter_hw_device hw -c:v h264_qsv
                    -gpu_copy on -i input.h264 -pix_fmt yuv420p out.yuv


Signed-off-by: Linjie Fu <linjie.fu@intel.com>
Signed-off-by: ChaoX A Liu <chaox.a.liu@intel.com>
---
 libavcodec/qsv.c          | 27 +++++++++++++-------
 libavcodec/qsv_internal.h |  6 ++---
 libavcodec/qsvdec.c       | 53 ++++++++++++++++++++++++++++++++++-----
 libavcodec/qsvdec.h       |  2 ++
 libavcodec/qsvdec_h2645.c | 10 ++++++++
 libavcodec/qsvdec_other.c |  5 ++++
 libavcodec/qsvenc.c       |  7 +++---
 7 files changed, 89 insertions(+), 21 deletions(-)

Comments

Linjie Fu April 8, 2019, 6:14 a.m.
> -----Original Message-----
> From: Fu, Linjie
> Sent: Tuesday, March 26, 2019 13:38
> To: ffmpeg-devel@ffmpeg.org
> Cc: Fu, Linjie <linjie.fu@intel.com>; ChaoX A Liu <chaox.a.liu@intel.com>
> Subject: [PATCH 1/3] lavc/qsvdec: add support for gpu_copy
> 
> Add support for GPU copy when QSV decoders work in system memory
> mode.
> However, memory must be contiguous and aligned to 128x64 to enable this
> feature (first introduced in FFmpeg 3.3.1).
> 
> GPUCopy = MFX_GPUCOPY_ON leads to performance improvement up to
> x10.
> 
> CMD:
> ffmpeg -init_hw_device qsv=hw -filter_hw_device hw -c:v h264_qsv
>                     -gpu_copy on -i input.h264 -pix_fmt yuv420p out.yuv
> 
> 
> Signed-off-by: Linjie Fu <linjie.fu@intel.com>
> Signed-off-by: ChaoX A Liu <chaox.a.liu@intel.com>
> ---
>  libavcodec/qsv.c          | 27 +++++++++++++-------
>  libavcodec/qsv_internal.h |  6 ++---
>  libavcodec/qsvdec.c       | 53 ++++++++++++++++++++++++++++++++++----
> -
>  libavcodec/qsvdec.h       |  2 ++
>  libavcodec/qsvdec_h2645.c | 10 ++++++++
>  libavcodec/qsvdec_other.c |  5 ++++
>  libavcodec/qsvenc.c       |  7 +++---
>  7 files changed, 89 insertions(+), 21 deletions(-)
> 
> diff --git a/libavcodec/qsv.c b/libavcodec/qsv.c
> index bb0d79588c..40e6c677cb 100644
> --- a/libavcodec/qsv.c
> +++ b/libavcodec/qsv.c
> @@ -277,15 +277,19 @@ load_plugin_fail:
>  }
> 
>  int ff_qsv_init_internal_session(AVCodecContext *avctx, mfxSession
> *session,
> -                                 const char *load_plugins)
> +                                 const char *load_plugins, int gpu_copy)
>  {
> -    mfxIMPL impl   = MFX_IMPL_AUTO_ANY;
> -    mfxVersion ver = { { QSV_VERSION_MINOR, QSV_VERSION_MAJOR } };
> +    mfxIMPL          impl = MFX_IMPL_AUTO_ANY;
> +    mfxVersion        ver = { { QSV_VERSION_MINOR,
> QSV_VERSION_MAJOR } };
> +    mfxInitParam init_par = { MFX_IMPL_AUTO_ANY };
> 
>      const char *desc;
>      int ret;
> 
> -    ret = MFXInit(impl, &ver, session);
> +    init_par.GPUCopy        = gpu_copy;
> +    init_par.Implementation = impl;
> +    init_par.Version        = ver;
> +    ret = MFXInitEx(init_par, session);
>      if (ret < 0)
>          return ff_qsv_print_error(avctx, ret,
>                                    "Error initializing an internal MFX session");
> @@ -571,7 +575,8 @@ static mfxStatus qsv_frame_get_hdl(mfxHDL pthis,
> mfxMemId mid, mfxHDL *hdl)
>  }
> 
>  int ff_qsv_init_session_device(AVCodecContext *avctx, mfxSession
> *psession,
> -                               AVBufferRef *device_ref, const char *load_plugins)
> +                               AVBufferRef *device_ref, const char *load_plugins,
> +                               int gpu_copy)
>  {
>      static const mfxHandleType handle_types[] = {
>          MFX_HANDLE_VA_DISPLAY,
> @@ -581,11 +586,12 @@ int ff_qsv_init_session_device(AVCodecContext
> *avctx, mfxSession *psession,
>      AVHWDeviceContext    *device_ctx = (AVHWDeviceContext*)device_ref-
> >data;
>      AVQSVDeviceContext *device_hwctx = device_ctx->hwctx;
>      mfxSession        parent_session = device_hwctx->session;
> +    mfxInitParam            init_par = { MFX_IMPL_AUTO_ANY };
> +    mfxHDL                    handle = NULL;
> 
>      mfxSession    session;
>      mfxVersion    ver;
>      mfxIMPL       impl;
> -    mfxHDL        handle = NULL;
>      mfxHandleType handle_type;
>      mfxStatus err;
> 
> @@ -611,7 +617,10 @@ int ff_qsv_init_session_device(AVCodecContext
> *avctx, mfxSession *psession,
>                 "from the session\n");
>      }
> 
> -    err = MFXInit(impl, &ver, &session);
> +    init_par.GPUCopy        = gpu_copy;
> +    init_par.Implementation = impl;
> +    init_par.Version        = ver;
> +    err = MFXInitEx(init_par, &session);
>      if (err != MFX_ERR_NONE)
>          return ff_qsv_print_error(avctx, err,
>                                    "Error initializing a child MFX session");
> @@ -642,7 +651,7 @@ int ff_qsv_init_session_device(AVCodecContext
> *avctx, mfxSession *psession,
> 
>  int ff_qsv_init_session_frames(AVCodecContext *avctx, mfxSession
> *psession,
>                                 QSVFramesContext *qsv_frames_ctx,
> -                               const char *load_plugins, int opaque)
> +                               const char *load_plugins, int opaque, int gpu_copy)
>  {
>      mfxFrameAllocator frame_allocator = {
>          .pthis  = qsv_frames_ctx,
> @@ -662,7 +671,7 @@ int ff_qsv_init_session_frames(AVCodecContext
> *avctx, mfxSession *psession,
>      int ret;
> 
>      ret = ff_qsv_init_session_device(avctx, &session,
> -                                     frames_ctx->device_ref, load_plugins);
> +                                     frames_ctx->device_ref, load_plugins, gpu_copy);
>      if (ret < 0)
>          return ret;
> 
> diff --git a/libavcodec/qsv_internal.h b/libavcodec/qsv_internal.h
> index 394c558883..8be6c3757c 100644
> --- a/libavcodec/qsv_internal.h
> +++ b/libavcodec/qsv_internal.h
> @@ -95,14 +95,14 @@ int ff_qsv_map_pixfmt(enum AVPixelFormat format,
> uint32_t *fourcc);
>  enum AVPictureType ff_qsv_map_pictype(int mfx_pic_type);
> 
>  int ff_qsv_init_internal_session(AVCodecContext *avctx, mfxSession
> *session,
> -                                 const char *load_plugins);
> +                                 const char *load_plugins, int gpu_copy);
> 
>  int ff_qsv_init_session_device(AVCodecContext *avctx, mfxSession
> *psession,
> -                               AVBufferRef *device_ref, const char *load_plugins);
> +                               AVBufferRef *device_ref, const char *load_plugins, int
> gpu_copy);
> 
>  int ff_qsv_init_session_frames(AVCodecContext *avctx, mfxSession
> *session,
>                                 QSVFramesContext *qsv_frames_ctx,
> -                               const char *load_plugins, int opaque);
> +                               const char *load_plugins, int opaque, int gpu_copy);
> 
>  int ff_qsv_find_surface_idx(QSVFramesContext *ctx, QSVFrame *frame);
> 
> diff --git a/libavcodec/qsvdec.c b/libavcodec/qsvdec.c
> index 4a0be811fb..5dd2b3834b 100644
> --- a/libavcodec/qsvdec.c
> +++ b/libavcodec/qsvdec.c
> @@ -34,9 +34,11 @@
>  #include "libavutil/pixdesc.h"
>  #include "libavutil/pixfmt.h"
>  #include "libavutil/time.h"
> +#include "libavutil/imgutils.h"
> 
>  #include "avcodec.h"
>  #include "internal.h"
> +#include "decode.h"
>  #include "qsv.h"
>  #include "qsv_internal.h"
>  #include "qsvdec.h"
> @@ -54,6 +56,31 @@ const AVCodecHWConfigInternal *ff_qsv_hw_configs[]
> = {
>      NULL
>  };
> 
> +static int ff_qsv_get_continuous_buffer(AVCodecContext *avctx, AVFrame
> *frame, AVBufferPool *pool)
> +{
> +    int ret = 0;
> +
> +    ff_decode_frame_props(avctx, frame);
> +
> +    frame->width       = avctx->width;
> +    frame->height      = avctx->height;
> +    frame->linesize[0] = FFALIGN(avctx->width, 128);
> +    frame->linesize[1] = frame->linesize[0];
> +    frame->buf[0]      = av_buffer_pool_get(pool);
> +    if (!frame->buf[0])
> +        return AVERROR(ENOMEM);
> +
> +    frame->data[0] = frame->buf[0]->data;
> +    frame->data[1] = frame->data[0] +
> +                            frame->linesize[0] * FFALIGN(avctx->height, 64);
> +
> +    ret = ff_attach_decode_data(frame);
> +    if (ret < 0)
> +        return ret;
> +
> +    return 0;
> +}
> +
>  static int qsv_init_session(AVCodecContext *avctx, QSVContext *q,
> mfxSession session,
>                              AVBufferRef *hw_frames_ref, AVBufferRef *hw_device_ref)
>  {
> @@ -74,7 +101,8 @@ static int qsv_init_session(AVCodecContext *avctx,
> QSVContext *q, mfxSession ses
> 
>          ret = ff_qsv_init_session_frames(avctx, &q->internal_session,
>                                           &q->frames_ctx, q->load_plugins,
> -                                         q->iopattern ==
> MFX_IOPATTERN_OUT_OPAQUE_MEMORY);
> +                                         q->iopattern ==
> MFX_IOPATTERN_OUT_OPAQUE_MEMORY,
> +                                         q->gpu_copy);
>          if (ret < 0) {
>              av_buffer_unref(&q->frames_ctx.hw_frames_ctx);
>              return ret;
> @@ -88,7 +116,7 @@ static int qsv_init_session(AVCodecContext *avctx,
> QSVContext *q, mfxSession ses
>          }
> 
>          ret = ff_qsv_init_session_device(avctx, &q->internal_session,
> -                                         hw_device_ref, q->load_plugins);
> +                                         hw_device_ref, q->load_plugins, q->gpu_copy);
>          if (ret < 0)
>              return ret;
> 
> @@ -96,7 +124,7 @@ static int qsv_init_session(AVCodecContext *avctx,
> QSVContext *q, mfxSession ses
>      } else {
>          if (!q->internal_session) {
>              ret = ff_qsv_init_internal_session(avctx, &q->internal_session,
> -                                               q->load_plugins);
> +                                               q->load_plugins, q->gpu_copy);
>              if (ret < 0)
>                  return ret;
>          }
> @@ -213,6 +241,12 @@ static int qsv_decode_init(AVCodecContext *avctx,
> QSVContext *q)
> 
>      q->frame_info = param.mfx.FrameInfo;
> 
> +    if (avctx->pix_fmt != AV_PIX_FMT_QSV)
> +        q->pool = av_buffer_pool_init(av_image_get_buffer_size(avctx-
> >pix_fmt,
> +                                                         FFALIGN(avctx->width, 128),
> +                                                         FFALIGN(avctx->height, 64), 1),
> +                                      av_buffer_allocz);
> +
>      return 0;
>  }
> 
> @@ -220,9 +254,15 @@ static int alloc_frame(AVCodecContext *avctx,
> QSVContext *q, QSVFrame *frame)
>  {
>      int ret;
> 
> -    ret = ff_get_buffer(avctx, frame->frame, AV_GET_BUFFER_FLAG_REF);
> -    if (ret < 0)
> -        return ret;
> +    if (!q->pool) {
> +        ret = ff_get_buffer(avctx, frame->frame, AV_GET_BUFFER_FLAG_REF);
> +        if (ret < 0)
> +            return ret;
> +    } else {
> +        ret = ff_qsv_get_continuous_buffer(avctx, frame->frame, q->pool);
> +        if (ret < 0)
> +            return ret;
> +    }
> 
>      if (frame->frame->format == AV_PIX_FMT_QSV) {
>          frame->surface = *(mfxFrameSurface1*)frame->frame->data[3];
> @@ -484,6 +524,7 @@ int ff_qsv_decode_close(QSVContext *q)
> 
>      av_buffer_unref(&q->frames_ctx.hw_frames_ctx);
>      av_buffer_unref(&q->frames_ctx.mids_buf);
> +    av_buffer_pool_uninit(&q->pool);
> 
>      return 0;
>  }
> diff --git a/libavcodec/qsvdec.h b/libavcodec/qsvdec.h
> index 111536caba..43ea03867e 100644
> --- a/libavcodec/qsvdec.h
> +++ b/libavcodec/qsvdec.h
> @@ -62,10 +62,12 @@ typedef struct QSVContext {
>      enum AVPixelFormat orig_pix_fmt;
>      uint32_t fourcc;
>      mfxFrameInfo frame_info;
> +    AVBufferPool *pool;
> 
>      // options set by the caller
>      int async_depth;
>      int iopattern;
> +    int gpu_copy;
> 
>      char *load_plugins;
> 
> diff --git a/libavcodec/qsvdec_h2645.c b/libavcodec/qsvdec_h2645.c
> index 9b49f5506e..3d1f1cbfac 100644
> --- a/libavcodec/qsvdec_h2645.c
> +++ b/libavcodec/qsvdec_h2645.c
> @@ -192,6 +192,11 @@ static const AVOption hevc_options[] = {
> 
>      { "load_plugins", "A :-separate list of hexadecimal plugin UIDs to load in an
> internal session",
>          OFFSET(qsv.load_plugins), AV_OPT_TYPE_STRING, { .str = "" }, 0, 0, VD },
> +
> +    { "gpu_copy", "A GPU-accelerated memory copy for non-QSV pipelines",
> OFFSET(qsv.gpu_copy), AV_OPT_TYPE_INT, { .i64 =
> MFX_GPUCOPY_DEFAULT }, MFX_GPUCOPY_DEFAULT,
> MFX_GPUCOPY_OFF, VD, "gpu_copy"},
> +        { "default", NULL, 0, AV_OPT_TYPE_CONST, { .i64 =
> MFX_GPUCOPY_DEFAULT }, 0, 0, VD, "gpu_copy"},
> +        { "on",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_GPUCOPY_ON },
> 0, 0, VD, "gpu_copy"},
> +        { "off",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_GPUCOPY_OFF },
> 0, 0, VD, "gpu_copy"},
>      { NULL },
>  };
> 
> @@ -227,6 +232,11 @@ AVCodec ff_hevc_qsv_decoder = {
>  #if CONFIG_H264_QSV_DECODER
>  static const AVOption options[] = {
>      { "async_depth", "Internal parallelization depth, the higher the value the
> higher the latency.", OFFSET(qsv.async_depth), AV_OPT_TYPE_INT, { .i64 =
> ASYNC_DEPTH_DEFAULT }, 1, INT_MAX, VD },
> +
> +    { "gpu_copy", "A GPU-accelerated memory copy for non-QSV pipelines",
> OFFSET(qsv.gpu_copy), AV_OPT_TYPE_INT, { .i64 =
> MFX_GPUCOPY_DEFAULT }, MFX_GPUCOPY_DEFAULT,
> MFX_GPUCOPY_OFF, VD, "gpu_copy"},
> +    { "default", NULL, 0, AV_OPT_TYPE_CONST, { .i64 =
> MFX_GPUCOPY_DEFAULT }, 0, 0, VD, "gpu_copy"},
> +    { "on",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_GPUCOPY_ON },
> 0, 0, VD, "gpu_copy"},
> +    { "off",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_GPUCOPY_OFF },
> 0, 0, VD, "gpu_copy"},
>      { NULL },
>  };
> 
> diff --git a/libavcodec/qsvdec_other.c b/libavcodec/qsvdec_other.c
> index 03251d2c85..37237180fb 100644
> --- a/libavcodec/qsvdec_other.c
> +++ b/libavcodec/qsvdec_other.c
> @@ -169,6 +169,11 @@ static void qsv_decode_flush(AVCodecContext
> *avctx)
>  #define VD AV_OPT_FLAG_VIDEO_PARAM |
> AV_OPT_FLAG_DECODING_PARAM
>  static const AVOption options[] = {
>      { "async_depth", "Internal parallelization depth, the higher the value the
> higher the latency.", OFFSET(qsv.async_depth), AV_OPT_TYPE_INT, { .i64 =
> ASYNC_DEPTH_DEFAULT }, 1, INT_MAX, VD },
> +
> +    { "gpu_copy", "A GPU-accelerated memory copy for non-QSV pipelines",
> OFFSET(qsv.gpu_copy), AV_OPT_TYPE_INT, { .i64 =
> MFX_GPUCOPY_DEFAULT }, MFX_GPUCOPY_DEFAULT,
> MFX_GPUCOPY_OFF, VD, "gpu_copy"},
> +    { "default", NULL, 0, AV_OPT_TYPE_CONST, { .i64 =
> MFX_GPUCOPY_DEFAULT }, 0, 0, VD, "gpu_copy"},
> +    { "on",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_GPUCOPY_ON },
> 0, 0, VD, "gpu_copy"},
> +    { "off",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_GPUCOPY_OFF },
> 0, 0, VD, "gpu_copy"},
>      { NULL },
>  };
> 
> diff --git a/libavcodec/qsvenc.c b/libavcodec/qsvenc.c
> index 5aa020d47b..3d008ed527 100644
> --- a/libavcodec/qsvenc.c
> +++ b/libavcodec/qsvenc.c
> @@ -909,7 +909,8 @@ static int qsvenc_init_session(AVCodecContext
> *avctx, QSVEncContext *q)
> 
>          ret = ff_qsv_init_session_frames(avctx, &q->internal_session,
>                                           &q->frames_ctx, q->load_plugins,
> -                                         q->param.IOPattern ==
> MFX_IOPATTERN_IN_OPAQUE_MEMORY);
> +                                         q->param.IOPattern ==
> MFX_IOPATTERN_IN_OPAQUE_MEMORY,
> +                                         MFX_GPUCOPY_OFF);
>          if (ret < 0) {
>              av_buffer_unref(&q->frames_ctx.hw_frames_ctx);
>              return ret;
> @@ -918,14 +919,14 @@ static int qsvenc_init_session(AVCodecContext
> *avctx, QSVEncContext *q)
>          q->session = q->internal_session;
>      } else if (avctx->hw_device_ctx) {
>          ret = ff_qsv_init_session_device(avctx, &q->internal_session,
> -                                         avctx->hw_device_ctx, q->load_plugins);
> +                                         avctx->hw_device_ctx, q->load_plugins,
> MFX_GPUCOPY_OFF);
>          if (ret < 0)
>              return ret;
> 
>          q->session = q->internal_session;
>      } else {
>          ret = ff_qsv_init_internal_session(avctx, &q->internal_session,
> -                                           q->load_plugins);
> +                                           q->load_plugins, MFX_GPUCOPY_OFF);
>          if (ret < 0)
>              return ret;
> 
> --
> 2.17.1

Ping?
Any comments for this patch set?
Decode performance improves noticeably on some platforms (6x, for example).

Patch hide | download patch | download mbox

diff --git a/libavcodec/qsv.c b/libavcodec/qsv.c
index bb0d79588c..40e6c677cb 100644
--- a/libavcodec/qsv.c
+++ b/libavcodec/qsv.c
@@ -277,15 +277,19 @@  load_plugin_fail:
 }
 
 int ff_qsv_init_internal_session(AVCodecContext *avctx, mfxSession *session,
-                                 const char *load_plugins)
+                                 const char *load_plugins, int gpu_copy)
 {
-    mfxIMPL impl   = MFX_IMPL_AUTO_ANY;
-    mfxVersion ver = { { QSV_VERSION_MINOR, QSV_VERSION_MAJOR } };
+    mfxIMPL          impl = MFX_IMPL_AUTO_ANY;
+    mfxVersion        ver = { { QSV_VERSION_MINOR, QSV_VERSION_MAJOR } };
+    mfxInitParam init_par = { MFX_IMPL_AUTO_ANY };
 
     const char *desc;
     int ret;
 
-    ret = MFXInit(impl, &ver, session);
+    init_par.GPUCopy        = gpu_copy;
+    init_par.Implementation = impl;
+    init_par.Version        = ver;
+    ret = MFXInitEx(init_par, session);
     if (ret < 0)
         return ff_qsv_print_error(avctx, ret,
                                   "Error initializing an internal MFX session");
@@ -571,7 +575,8 @@  static mfxStatus qsv_frame_get_hdl(mfxHDL pthis, mfxMemId mid, mfxHDL *hdl)
 }
 
 int ff_qsv_init_session_device(AVCodecContext *avctx, mfxSession *psession,
-                               AVBufferRef *device_ref, const char *load_plugins)
+                               AVBufferRef *device_ref, const char *load_plugins,
+                               int gpu_copy)
 {
     static const mfxHandleType handle_types[] = {
         MFX_HANDLE_VA_DISPLAY,
@@ -581,11 +586,12 @@  int ff_qsv_init_session_device(AVCodecContext *avctx, mfxSession *psession,
     AVHWDeviceContext    *device_ctx = (AVHWDeviceContext*)device_ref->data;
     AVQSVDeviceContext *device_hwctx = device_ctx->hwctx;
     mfxSession        parent_session = device_hwctx->session;
+    mfxInitParam            init_par = { MFX_IMPL_AUTO_ANY };
+    mfxHDL                    handle = NULL;
 
     mfxSession    session;
     mfxVersion    ver;
     mfxIMPL       impl;
-    mfxHDL        handle = NULL;
     mfxHandleType handle_type;
     mfxStatus err;
 
@@ -611,7 +617,10 @@  int ff_qsv_init_session_device(AVCodecContext *avctx, mfxSession *psession,
                "from the session\n");
     }
 
-    err = MFXInit(impl, &ver, &session);
+    init_par.GPUCopy        = gpu_copy;
+    init_par.Implementation = impl;
+    init_par.Version        = ver;
+    err = MFXInitEx(init_par, &session);
     if (err != MFX_ERR_NONE)
         return ff_qsv_print_error(avctx, err,
                                   "Error initializing a child MFX session");
@@ -642,7 +651,7 @@  int ff_qsv_init_session_device(AVCodecContext *avctx, mfxSession *psession,
 
 int ff_qsv_init_session_frames(AVCodecContext *avctx, mfxSession *psession,
                                QSVFramesContext *qsv_frames_ctx,
-                               const char *load_plugins, int opaque)
+                               const char *load_plugins, int opaque, int gpu_copy)
 {
     mfxFrameAllocator frame_allocator = {
         .pthis  = qsv_frames_ctx,
@@ -662,7 +671,7 @@  int ff_qsv_init_session_frames(AVCodecContext *avctx, mfxSession *psession,
     int ret;
 
     ret = ff_qsv_init_session_device(avctx, &session,
-                                     frames_ctx->device_ref, load_plugins);
+                                     frames_ctx->device_ref, load_plugins, gpu_copy);
     if (ret < 0)
         return ret;
 
diff --git a/libavcodec/qsv_internal.h b/libavcodec/qsv_internal.h
index 394c558883..8be6c3757c 100644
--- a/libavcodec/qsv_internal.h
+++ b/libavcodec/qsv_internal.h
@@ -95,14 +95,14 @@  int ff_qsv_map_pixfmt(enum AVPixelFormat format, uint32_t *fourcc);
 enum AVPictureType ff_qsv_map_pictype(int mfx_pic_type);
 
 int ff_qsv_init_internal_session(AVCodecContext *avctx, mfxSession *session,
-                                 const char *load_plugins);
+                                 const char *load_plugins, int gpu_copy);
 
 int ff_qsv_init_session_device(AVCodecContext *avctx, mfxSession *psession,
-                               AVBufferRef *device_ref, const char *load_plugins);
+                               AVBufferRef *device_ref, const char *load_plugins, int gpu_copy);
 
 int ff_qsv_init_session_frames(AVCodecContext *avctx, mfxSession *session,
                                QSVFramesContext *qsv_frames_ctx,
-                               const char *load_plugins, int opaque);
+                               const char *load_plugins, int opaque, int gpu_copy);
 
 int ff_qsv_find_surface_idx(QSVFramesContext *ctx, QSVFrame *frame);
 
diff --git a/libavcodec/qsvdec.c b/libavcodec/qsvdec.c
index 4a0be811fb..5dd2b3834b 100644
--- a/libavcodec/qsvdec.c
+++ b/libavcodec/qsvdec.c
@@ -34,9 +34,11 @@ 
 #include "libavutil/pixdesc.h"
 #include "libavutil/pixfmt.h"
 #include "libavutil/time.h"
+#include "libavutil/imgutils.h"
 
 #include "avcodec.h"
 #include "internal.h"
+#include "decode.h"
 #include "qsv.h"
 #include "qsv_internal.h"
 #include "qsvdec.h"
@@ -54,6 +56,31 @@  const AVCodecHWConfigInternal *ff_qsv_hw_configs[] = {
     NULL
 };
 
+static int ff_qsv_get_continuous_buffer(AVCodecContext *avctx, AVFrame *frame, AVBufferPool *pool)
+{
+    int ret = 0;
+
+    ff_decode_frame_props(avctx, frame);
+
+    frame->width       = avctx->width;
+    frame->height      = avctx->height;
+    frame->linesize[0] = FFALIGN(avctx->width, 128);
+    frame->linesize[1] = frame->linesize[0];
+    frame->buf[0]      = av_buffer_pool_get(pool);
+    if (!frame->buf[0])
+        return AVERROR(ENOMEM);
+
+    frame->data[0] = frame->buf[0]->data;
+    frame->data[1] = frame->data[0] +
+                            frame->linesize[0] * FFALIGN(avctx->height, 64);
+
+    ret = ff_attach_decode_data(frame);
+    if (ret < 0)
+        return ret;
+
+    return 0;
+}
+
 static int qsv_init_session(AVCodecContext *avctx, QSVContext *q, mfxSession session,
                             AVBufferRef *hw_frames_ref, AVBufferRef *hw_device_ref)
 {
@@ -74,7 +101,8 @@  static int qsv_init_session(AVCodecContext *avctx, QSVContext *q, mfxSession ses
 
         ret = ff_qsv_init_session_frames(avctx, &q->internal_session,
                                          &q->frames_ctx, q->load_plugins,
-                                         q->iopattern == MFX_IOPATTERN_OUT_OPAQUE_MEMORY);
+                                         q->iopattern == MFX_IOPATTERN_OUT_OPAQUE_MEMORY,
+                                         q->gpu_copy);
         if (ret < 0) {
             av_buffer_unref(&q->frames_ctx.hw_frames_ctx);
             return ret;
@@ -88,7 +116,7 @@  static int qsv_init_session(AVCodecContext *avctx, QSVContext *q, mfxSession ses
         }
 
         ret = ff_qsv_init_session_device(avctx, &q->internal_session,
-                                         hw_device_ref, q->load_plugins);
+                                         hw_device_ref, q->load_plugins, q->gpu_copy);
         if (ret < 0)
             return ret;
 
@@ -96,7 +124,7 @@  static int qsv_init_session(AVCodecContext *avctx, QSVContext *q, mfxSession ses
     } else {
         if (!q->internal_session) {
             ret = ff_qsv_init_internal_session(avctx, &q->internal_session,
-                                               q->load_plugins);
+                                               q->load_plugins, q->gpu_copy);
             if (ret < 0)
                 return ret;
         }
@@ -213,6 +241,12 @@  static int qsv_decode_init(AVCodecContext *avctx, QSVContext *q)
 
     q->frame_info = param.mfx.FrameInfo;
 
+    if (avctx->pix_fmt != AV_PIX_FMT_QSV)
+        q->pool = av_buffer_pool_init(av_image_get_buffer_size(avctx->pix_fmt,
+                                                         FFALIGN(avctx->width, 128),
+                                                         FFALIGN(avctx->height, 64), 1),
+                                      av_buffer_allocz);
+
     return 0;
 }
 
@@ -220,9 +254,15 @@  static int alloc_frame(AVCodecContext *avctx, QSVContext *q, QSVFrame *frame)
 {
     int ret;
 
-    ret = ff_get_buffer(avctx, frame->frame, AV_GET_BUFFER_FLAG_REF);
-    if (ret < 0)
-        return ret;
+    if (!q->pool) {
+        ret = ff_get_buffer(avctx, frame->frame, AV_GET_BUFFER_FLAG_REF);
+        if (ret < 0)
+            return ret;
+    } else {
+        ret = ff_qsv_get_continuous_buffer(avctx, frame->frame, q->pool);
+        if (ret < 0)
+            return ret;
+    }
 
     if (frame->frame->format == AV_PIX_FMT_QSV) {
         frame->surface = *(mfxFrameSurface1*)frame->frame->data[3];
@@ -484,6 +524,7 @@  int ff_qsv_decode_close(QSVContext *q)
 
     av_buffer_unref(&q->frames_ctx.hw_frames_ctx);
     av_buffer_unref(&q->frames_ctx.mids_buf);
+    av_buffer_pool_uninit(&q->pool);
 
     return 0;
 }
diff --git a/libavcodec/qsvdec.h b/libavcodec/qsvdec.h
index 111536caba..43ea03867e 100644
--- a/libavcodec/qsvdec.h
+++ b/libavcodec/qsvdec.h
@@ -62,10 +62,12 @@  typedef struct QSVContext {
     enum AVPixelFormat orig_pix_fmt;
     uint32_t fourcc;
     mfxFrameInfo frame_info;
+    AVBufferPool *pool;
 
     // options set by the caller
     int async_depth;
     int iopattern;
+    int gpu_copy;
 
     char *load_plugins;
 
diff --git a/libavcodec/qsvdec_h2645.c b/libavcodec/qsvdec_h2645.c
index 9b49f5506e..3d1f1cbfac 100644
--- a/libavcodec/qsvdec_h2645.c
+++ b/libavcodec/qsvdec_h2645.c
@@ -192,6 +192,11 @@  static const AVOption hevc_options[] = {
 
     { "load_plugins", "A :-separate list of hexadecimal plugin UIDs to load in an internal session",
         OFFSET(qsv.load_plugins), AV_OPT_TYPE_STRING, { .str = "" }, 0, 0, VD },
+
+    { "gpu_copy", "A GPU-accelerated memory copy for non-QSV pipelines", OFFSET(qsv.gpu_copy), AV_OPT_TYPE_INT, { .i64 = MFX_GPUCOPY_DEFAULT }, MFX_GPUCOPY_DEFAULT, MFX_GPUCOPY_OFF, VD, "gpu_copy"},
+        { "default", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_GPUCOPY_DEFAULT }, 0, 0, VD, "gpu_copy"},
+        { "on",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_GPUCOPY_ON },      0, 0, VD, "gpu_copy"},
+        { "off",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_GPUCOPY_OFF },     0, 0, VD, "gpu_copy"},
     { NULL },
 };
 
@@ -227,6 +232,11 @@  AVCodec ff_hevc_qsv_decoder = {
 #if CONFIG_H264_QSV_DECODER
 static const AVOption options[] = {
     { "async_depth", "Internal parallelization depth, the higher the value the higher the latency.", OFFSET(qsv.async_depth), AV_OPT_TYPE_INT, { .i64 = ASYNC_DEPTH_DEFAULT }, 1, INT_MAX, VD },
+
+    { "gpu_copy", "A GPU-accelerated memory copy for non-QSV pipelines", OFFSET(qsv.gpu_copy), AV_OPT_TYPE_INT, { .i64 = MFX_GPUCOPY_DEFAULT }, MFX_GPUCOPY_DEFAULT, MFX_GPUCOPY_OFF, VD, "gpu_copy"},
+    { "default", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_GPUCOPY_DEFAULT }, 0, 0, VD, "gpu_copy"},
+    { "on",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_GPUCOPY_ON },      0, 0, VD, "gpu_copy"},
+    { "off",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_GPUCOPY_OFF },     0, 0, VD, "gpu_copy"},
     { NULL },
 };
 
diff --git a/libavcodec/qsvdec_other.c b/libavcodec/qsvdec_other.c
index 03251d2c85..37237180fb 100644
--- a/libavcodec/qsvdec_other.c
+++ b/libavcodec/qsvdec_other.c
@@ -169,6 +169,11 @@  static void qsv_decode_flush(AVCodecContext *avctx)
 #define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM
 static const AVOption options[] = {
     { "async_depth", "Internal parallelization depth, the higher the value the higher the latency.", OFFSET(qsv.async_depth), AV_OPT_TYPE_INT, { .i64 = ASYNC_DEPTH_DEFAULT }, 1, INT_MAX, VD },
+
+    { "gpu_copy", "A GPU-accelerated memory copy for non-QSV pipelines", OFFSET(qsv.gpu_copy), AV_OPT_TYPE_INT, { .i64 = MFX_GPUCOPY_DEFAULT }, MFX_GPUCOPY_DEFAULT, MFX_GPUCOPY_OFF, VD, "gpu_copy"},
+    { "default", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_GPUCOPY_DEFAULT }, 0, 0, VD, "gpu_copy"},
+    { "on",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_GPUCOPY_ON },      0, 0, VD, "gpu_copy"},
+    { "off",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_GPUCOPY_OFF },     0, 0, VD, "gpu_copy"},
     { NULL },
 };
 
diff --git a/libavcodec/qsvenc.c b/libavcodec/qsvenc.c
index 5aa020d47b..3d008ed527 100644
--- a/libavcodec/qsvenc.c
+++ b/libavcodec/qsvenc.c
@@ -909,7 +909,8 @@  static int qsvenc_init_session(AVCodecContext *avctx, QSVEncContext *q)
 
         ret = ff_qsv_init_session_frames(avctx, &q->internal_session,
                                          &q->frames_ctx, q->load_plugins,
-                                         q->param.IOPattern == MFX_IOPATTERN_IN_OPAQUE_MEMORY);
+                                         q->param.IOPattern == MFX_IOPATTERN_IN_OPAQUE_MEMORY,
+                                         MFX_GPUCOPY_OFF);
         if (ret < 0) {
             av_buffer_unref(&q->frames_ctx.hw_frames_ctx);
             return ret;
@@ -918,14 +919,14 @@  static int qsvenc_init_session(AVCodecContext *avctx, QSVEncContext *q)
         q->session = q->internal_session;
     } else if (avctx->hw_device_ctx) {
         ret = ff_qsv_init_session_device(avctx, &q->internal_session,
-                                         avctx->hw_device_ctx, q->load_plugins);
+                                         avctx->hw_device_ctx, q->load_plugins, MFX_GPUCOPY_OFF);
         if (ret < 0)
             return ret;
 
         q->session = q->internal_session;
     } else {
         ret = ff_qsv_init_internal_session(avctx, &q->internal_session,
-                                           q->load_plugins);
+                                           q->load_plugins, MFX_GPUCOPY_OFF);
         if (ret < 0)
             return ret;