diff mbox

[FFmpeg-devel,3/6] lavc/qsv: Enable hwaccel qsv_vidmem.

Message ID 1472108839-22207-4-git-send-email-sdk@nablet.com
State Changes Requested
Headers show

Commit Message

Nablet Developer Aug. 25, 2016, 7:07 a.m. UTC
From: ChaoX A Liu <chaox.a.liu@intel.com>

Signed-off-by: ChaoX A Liu <chaox.a.liu@intel.com>
---
 ffmpeg.c                  |   2 +-
 ffmpeg.h                  |   2 +
 ffmpeg_opt.c              |   2 +-
 ffmpeg_qsv.c              | 636 +++++++++++++++++++++++++++++++++++++++++++++-
 libavcodec/qsv.h          |   3 +
 libavcodec/qsv_internal.h |   2 +
 libavcodec/qsvdec.c       |   5 +-
 libavcodec/qsvenc.c       |   2 +
 8 files changed, 649 insertions(+), 5 deletions(-)

Comments

Jean-Baptiste Kempf Aug. 25, 2016, 7:26 a.m. UTC | #1
On 25 Aug, Nablet Developer wrote :
> From: ChaoX A Liu <chaox.a.liu@intel.com>

what is vidmem?

> diff --git a/ffmpeg.c b/ffmpeg.c
> diff --git a/ffmpeg.h b/ffmpeg.h

I doubt this should be in the same commit.

> +INTEL CORPORATION PROPRIETARY INFORMATION
> +This software is supplied under the terms of a license agreement or nondisclosure
> +agreement with Intel Corporation and may not be copied or disclosed except in
> +accordance with the terms of that agreement
> +Copyright(c) 2011-2014 Intel Corporation. All Rights Reserved.

WHAT?

> diff --git a/libavcodec/qsvdec.c b/libavcodec/qsvdec.c
> index b9de0af..47dd818 100644
> --- a/libavcodec/qsvdec.c
> +++ b/libavcodec/qsvdec.c
> @@ -72,6 +72,7 @@ static int qsv_decode_init(AVCodecContext *avctx, QSVContext *q, AVPacket *avpkt
>          q->iopattern      = qsv->iopattern;
>          q->ext_buffers    = qsv->ext_buffers;
>          q->nb_ext_buffers = qsv->nb_ext_buffers;
> +        qsv->nb_decoder_surfaces = q->async_depth;
>      }
>      if (!q->session) {
>          if (!q->internal_qs.session) {
> @@ -88,7 +89,7 @@ static int qsv_decode_init(AVCodecContext *avctx, QSVContext *q, AVPacket *avpkt
>          if (ret < 0) {
>              av_log(avctx, AV_LOG_ERROR, "Failed to load plugins %s, ret = %s\n",
>                      q->load_plugins, av_err2str(ret));
> -            return ff_qsv_error(ret);
> +            return ret;
>          }
>      }
>  
> @@ -149,7 +150,7 @@ static int qsv_decode_init(AVCodecContext *avctx, QSVContext *q, AVPacket *avpkt
>         So weare  pre-allocating fifo big enough for 17 elements:
>       */
>      if (!q->async_fifo) {
> -        q->async_fifo = av_fifo_alloc((1 + 16) *
> +        q->async_fifo = av_fifo_alloc((1 + 16 + q->async_depth) *
>                                        (sizeof(mfxSyncPoint) + sizeof(QSVFrame*)));
>          if (!q->async_fifo)
>              return AVERROR(ENOMEM);

I doubt those changes are OK.

> diff --git a/libavcodec/qsvenc.c b/libavcodec/qsvenc.c
> index 81b8f6f..041f298 100644
> --- a/libavcodec/qsvenc.c
> +++ b/libavcodec/qsvenc.c
> @@ -760,6 +760,8 @@ int ff_qsv_enc_init(AVCodecContext *avctx, QSVEncContext *q)
>  
>              q->param.ExtParam[q->param.NumExtParam++] = q->extparam_internal[i];
>          }
> +
> +        qsv->nb_encoder_surfaces = q->req.NumFrameSuggested + q->async_depth;
>      } else {
>          q->param.ExtParam    = q->extparam_internal;
>          q->param.NumExtParam = q->nb_extparam_internal;


Patch is way too big, and does too many unrelated things.

With my kindest regards,
Hendrik Leppkes Aug. 25, 2016, 10:24 a.m. UTC | #2
On Thu, Aug 25, 2016 at 2:07 AM, Nablet Developer <sdk@nablet.com> wrote:
> From: ChaoX A Liu <chaox.a.liu@intel.com>
>
> Signed-off-by: ChaoX A Liu <chaox.a.liu@intel.com>
> ---
>  ffmpeg.c                  |   2 +-
>  ffmpeg.h                  |   2 +
>  ffmpeg_opt.c              |   2 +-
>  ffmpeg_qsv.c              | 636 +++++++++++++++++++++++++++++++++++++++++++++-
>  libavcodec/qsv.h          |   3 +
>  libavcodec/qsv_internal.h |   2 +
>  libavcodec/qsvdec.c       |   5 +-
>  libavcodec/qsvenc.c       |   2 +
>  8 files changed, 649 insertions(+), 5 deletions(-)
>

This is a giant patch that doesnt even begin to describe what it does.
So, whats it good for? We can already do transcoding of video from QSV
decoder to QSV encoder all in GPU memory without 600+ lines of new
code. Admittedly it currently has a few issues, but those could be
fixed, but why do we need 600 new lines of code?

- Hendrik
Nablet Developer Sept. 14, 2016, 5:33 p.m. UTC | #3
>>  ffmpeg_qsv.c              | 636 +++++++++++++++++++++++++++++++++++++++++++++-
>>  libavcodec/qsv.h          |   3 +
>>  libavcodec/qsv_internal.h |   2 +
>>  libavcodec/qsvdec.c       |   5 +-
>>  libavcodec/qsvenc.c       |   2 +
>>  8 files changed, 649 insertions(+), 5 deletions(-)
>>
>
> This is a giant patch that doesnt even begin to describe what it does.
> So, whats it good for? We can already do transcoding of video from QSV
> decoder to QSV encoder all in GPU memory without 600+ lines of new
> code. Admittedly it currently has a few issues, but those could be
> fixed, but why do we need 600 new lines of code?

1.      At the GPU level, all frames are processed in tiled mode (what we 
call video memory mode), which cannot be read or written by the CPU 
directly. The frame buffer should be allocated via vaCreateSurfaces. Any 
non-tiled memory must be copied to tiled memory when using GPU 
acceleration. The copying task is done by MediaSDK internally.

2.      In the current implementation, the frame buffer is allocated by 
ffmpeg in linear mode (what we call system memory); the QSV decoder’s 
output and the QSV encoder’s input are both set to system memory mode 
(e.g. iopattern = MFX_IOPATTERN_OUT_SYSTEM_MEMORY in the qsv decoder); so 
there are 2 memory copy processes inside MediaSDK: one copying from video 
memory to system memory when outputting from the HW decoder, and another 
copying from system memory to video memory when feeding the HW encoder. 
This decreases transcoding performance greatly, especially for high 
resolutions such as 1080p & 4K.

3.      The patches avoid this additional memory copy when all modules in 
the transcoding pipeline can be accelerated by the GPU. To achieve this, 
the iopattern must be set to video memory, and an external allocator must 
be implemented as MediaSDK requires and set on the QSV codec. Most of the 
600 lines in the patches implement this external allocator. The patches 
also add some code to check whether all modules in the transcoding 
pipeline can be accelerated by the GPU, so that the transcoder can select 
video memory or system memory automatically.

4.      In our tests, transcoding performance improves by about 20% or 
more with these patches, depending on resolution. It can reach the 
performance declared in the QSV specification.
diff mbox

Patch

diff --git a/ffmpeg.c b/ffmpeg.c
index bad311d..0bab9e9 100644
--- a/ffmpeg.c
+++ b/ffmpeg.c
@@ -3050,7 +3050,7 @@  static int transcode_init(void)
             set_encoder_id(output_files[ost->file_index], ost);
 
 #if CONFIG_LIBMFX
-            if (qsv_transcode_init(ost))
+            if (qsv_transcode_init_vidmem(ost))
                 exit_program(1);
 #endif
 
diff --git a/ffmpeg.h b/ffmpeg.h
index 49d65d8..2633336 100644
--- a/ffmpeg.h
+++ b/ffmpeg.h
@@ -585,6 +585,8 @@  int vda_init(AVCodecContext *s);
 int videotoolbox_init(AVCodecContext *s);
 int qsv_init(AVCodecContext *s);
 int qsv_transcode_init(OutputStream *ost);
+int qsv_init_vidmem(AVCodecContext *s);
+int qsv_transcode_init_vidmem(OutputStream *ost);
 int vaapi_decode_init(AVCodecContext *avctx);
 int vaapi_device_init(const char *device);
 int cuvid_init(AVCodecContext *s);
diff --git a/ffmpeg_opt.c b/ffmpeg_opt.c
index 2ea09cf..b5e4483 100644
--- a/ffmpeg_opt.c
+++ b/ffmpeg_opt.c
@@ -79,7 +79,7 @@  const HWAccel hwaccels[] = {
     { "videotoolbox",   videotoolbox_init,   HWACCEL_VIDEOTOOLBOX,   AV_PIX_FMT_VIDEOTOOLBOX },
 #endif
 #if CONFIG_LIBMFX
-    { "qsv",   qsv_init,   HWACCEL_QSV,   AV_PIX_FMT_QSV },
+    { "qsv",   qsv_init_vidmem,   HWACCEL_QSV,   AV_PIX_FMT_QSV },
 #endif
 #if CONFIG_VAAPI
     { "vaapi", vaapi_decode_init, HWACCEL_VAAPI, AV_PIX_FMT_VAAPI },
diff --git a/ffmpeg_qsv.c b/ffmpeg_qsv.c
index acc54dd..43402d6 100644
--- a/ffmpeg_qsv.c
+++ b/ffmpeg_qsv.c
@@ -18,11 +18,15 @@ 
 
 #include <mfx/mfxvideo.h>
 #include <stdlib.h>
+#include <stdbool.h>
+#include <va/va.h>
 
 #include "libavutil/dict.h"
 #include "libavutil/mem.h"
 #include "libavutil/opt.h"
+#include "libavutil/avstring.h"
 #include "libavcodec/qsv.h"
+#include "libavcodec/qsv_internal.h"
 
 #include "ffmpeg.h"
 
@@ -34,6 +38,8 @@  typedef struct QSVContext {
     mfxExtOpaqueSurfaceAlloc opaque_alloc;
     AVBufferRef             *opaque_surfaces_buf;
 
+    mfxFrameAllocator frame_allocator;
+
     uint8_t           *surface_used;
     mfxFrameSurface1 **surface_ptrs;
     int nb_surfaces;
@@ -60,7 +66,7 @@  static int qsv_get_buffer(AVCodecContext *s, AVFrame *frame, int flags)
                                          buffer_release, &qsv->surface_used[i], 0);
         if (!frame->buf[0])
             return AVERROR(ENOMEM);
-        frame->data[3]       = (uint8_t*)qsv->surface_ptrs[i];
+        frame->data[3]       = frame->buf[0]->data;
         qsv->surface_used[i] = 1;
         return 0;
     }
@@ -265,3 +271,631 @@  fail:
     av_freep(&qsv);
     return AVERROR_UNKNOWN;
 }
+
+enum {
+    MFX_FOURCC_VP8_NV12    = MFX_MAKEFOURCC('V','P','8','N'),
+    MFX_FOURCC_VP8_MBDATA  = MFX_MAKEFOURCC('V','P','8','M'),
+    MFX_FOURCC_VP8_SEGMAP  = MFX_MAKEFOURCC('V','P','8','S'),
+};
+
+typedef struct vaapiMemId
+{
+    VASurfaceID* m_surface;
+    VAImage m_image;
+    unsigned int m_fourcc;
+    mfxU8* m_sys_buffer;
+    mfxU8* m_va_buffer;
+} vaapiMemId;
+
+static QSVSession g_session;
+
+/* ****************************************************************************** *\
+
+INTEL CORPORATION PROPRIETARY INFORMATION
+This software is supplied under the terms of a license agreement or nondisclosure
+agreement with Intel Corporation and may not be copied or disclosed except in
+accordance with the terms of that agreement
+Copyright(c) 2011-2014 Intel Corporation. All Rights Reserved.
+
+\* ****************************************************************************** */
+static mfxStatus va_to_mfx_status(VAStatus va_res)
+{
+    mfxStatus mfxRes = MFX_ERR_NONE;
+
+    switch (va_res) {
+        case VA_STATUS_SUCCESS:
+            mfxRes = MFX_ERR_NONE;
+            break;
+        case VA_STATUS_ERROR_ALLOCATION_FAILED:
+            mfxRes = MFX_ERR_MEMORY_ALLOC;
+            break;
+        case VA_STATUS_ERROR_ATTR_NOT_SUPPORTED:
+        case VA_STATUS_ERROR_UNSUPPORTED_PROFILE:
+        case VA_STATUS_ERROR_UNSUPPORTED_ENTRYPOINT:
+        case VA_STATUS_ERROR_UNSUPPORTED_RT_FORMAT:
+        case VA_STATUS_ERROR_UNSUPPORTED_BUFFERTYPE:
+        case VA_STATUS_ERROR_FLAG_NOT_SUPPORTED:
+        case VA_STATUS_ERROR_RESOLUTION_NOT_SUPPORTED:
+            mfxRes = MFX_ERR_UNSUPPORTED;
+            break;
+        case VA_STATUS_ERROR_INVALID_DISPLAY:
+        case VA_STATUS_ERROR_INVALID_CONFIG:
+        case VA_STATUS_ERROR_INVALID_CONTEXT:
+        case VA_STATUS_ERROR_INVALID_SURFACE:
+        case VA_STATUS_ERROR_INVALID_BUFFER:
+        case VA_STATUS_ERROR_INVALID_IMAGE:
+        case VA_STATUS_ERROR_INVALID_SUBPICTURE:
+            mfxRes = MFX_ERR_NOT_INITIALIZED;
+            break;
+        case VA_STATUS_ERROR_INVALID_PARAMETER:
+            mfxRes = MFX_ERR_INVALID_VIDEO_PARAM;
+        default:
+            mfxRes = MFX_ERR_UNKNOWN;
+            break;
+    }
+
+    return mfxRes;
+}
+
+static unsigned int ConvertMfxFourccToVAFormat(mfxU32 fourcc)
+{
+    switch (fourcc) {
+        case MFX_FOURCC_NV12:
+            return VA_FOURCC_NV12;
+        case MFX_FOURCC_YUY2:
+            return VA_FOURCC_YUY2;
+        case MFX_FOURCC_YV12:
+            return VA_FOURCC_YV12;
+        case MFX_FOURCC_RGB4:
+            return VA_FOURCC_ARGB;
+        case MFX_FOURCC_P8:
+            return VA_FOURCC_P208;
+
+        default:
+            return 0;
+    }
+}
+
+static unsigned int ConvertVP8FourccToMfxFourcc(mfxU32 fourcc)
+{
+    switch (fourcc) {
+        case MFX_FOURCC_VP8_NV12:
+        case MFX_FOURCC_VP8_MBDATA:
+            return MFX_FOURCC_NV12;
+        case MFX_FOURCC_VP8_SEGMAP:
+            return MFX_FOURCC_P8;
+
+        default:
+            return fourcc;
+    }
+}
+
+static mfxStatus frame_alloc(mfxHDL pthis, mfxFrameAllocRequest *request, mfxFrameAllocResponse *response)
+{
+    int i, format;
+    VAStatus va_res = VA_STATUS_SUCCESS;
+    VASurfaceID* surfaces = NULL;
+    vaapiMemId* vaapi_mid = NULL;
+    mfxFrameSurface1 *mfxsurface = NULL;
+    VAContextID context_id;
+    VABufferType codedbuf_type;
+    mfxStatus mfx_res = MFX_ERR_NONE;
+    mfxMemId* mids = NULL;
+    VASurfaceAttrib attrib;
+    mfxU16 surface_num;
+    unsigned int va_fourcc = 0;
+    mfxU32 fourcc = request->Info.FourCC;
+    QSVContext *q = pthis;
+    AVQSVContext *qsv = q->ost->enc_ctx->hwaccel_context;
+    mfxU16 numAllocated = 0;
+    bool bCreateSrfSucceeded = false;
+    mfxU32 mfx_fourcc;
+    int codedbuf_size;
+    int width32;
+    int height32;
+    void *avctx = NULL;
+
+    av_log(avctx, AV_LOG_INFO, "=========vaapi alloc frame==============\n");
+    if (!request || !response || !request->NumFrameSuggested)
+        return MFX_ERR_MEMORY_ALLOC;
+
+    memset(response, 0, sizeof(*response));
+    surface_num = request->NumFrameSuggested;
+    if ((request->Type & MFX_MEMTYPE_EXTERNAL_FRAME) &&
+            (request->Type & MFX_MEMTYPE_FROM_DECODE))
+        surface_num += (qsv->nb_encoder_surfaces + qsv->nb_decoder_surfaces);
+
+    av_log(avctx, AV_LOG_INFO, "VAAPI: va_dpy =%p, surface_num=%d, width=%d, height=%d\n",
+            g_session.va_display, surface_num, request->Info.Width, request->Info.Height);
+    av_log(avctx, AV_LOG_INFO, "VAAPI: request->Type=%x\n",request->Type);
+
+    surfaces = (VASurfaceID*)av_calloc(surface_num, sizeof(VASurfaceID));
+    mids = (mfxMemId*)av_calloc(surface_num, sizeof(mfxMemId));
+    if (!surfaces || !mids) {
+        av_log(avctx, AV_LOG_ERROR, "ERROR: memory allocation failed\n");
+        return MFX_ERR_MEMORY_ALLOC;
+    }
+
+    mfx_fourcc = ConvertVP8FourccToMfxFourcc(fourcc);
+    va_fourcc  = ConvertMfxFourccToVAFormat(mfx_fourcc);
+    if (va_fourcc != VA_FOURCC_P208) {
+        av_log(avctx, AV_LOG_INFO, "VAAPI: va_fourcc != VA_FOURCC_P208\n");
+        attrib.type  = VASurfaceAttribPixelFormat;
+        attrib.flags = VA_SURFACE_ATTRIB_SETTABLE;
+        attrib.value.type = VAGenericValueTypeInteger;
+        attrib.value.value.i = va_fourcc;
+        format = va_fourcc;
+
+        if (fourcc == MFX_FOURCC_VP8_NV12) {
+            // special configuration for NV12 surf allocation for VP8 hybrid encoder is required
+            attrib.type          = (VASurfaceAttribType)VASurfaceAttribUsageHint;
+            attrib.value.value.i = VA_SURFACE_ATTRIB_USAGE_HINT_ENCODER;
+        } else if (fourcc == MFX_FOURCC_VP8_MBDATA) {
+            // special configuration for MB data surf allocation for VP8 hybrid encoder is required
+            attrib.value.value.i = VA_FOURCC_P208;
+            format               = VA_FOURCC_P208;
+        } else if (va_fourcc == VA_FOURCC_NV12) {
+            format = VA_RT_FORMAT_YUV420;
+        }
+
+        va_res = vaCreateSurfaces(g_session.va_display,
+                                  format,
+                                  request->Info.Width, request->Info.Height,
+                                  surfaces,
+                                  surface_num,
+                                  &attrib, 1);
+        bCreateSrfSucceeded = (va_res==VA_STATUS_SUCCESS);
+    } else {
+        av_log(avctx, AV_LOG_INFO, "VAAPI: va_fourcc == VA_FOURCC_P208\n");
+        context_id = request->reserved[0];
+        width32    = 32 * ((request->Info.Width + 31) >> 5);
+        height32   = 32 * ((request->Info.Height + 31) >> 5);
+
+        if (fourcc == MFX_FOURCC_VP8_SEGMAP) {
+            codedbuf_size = request->Info.Width * request->Info.Height;
+            codedbuf_type = (VABufferType)VAEncMacroblockMapBufferType;
+        } else {
+            codedbuf_size = ((width32 * height32) * 400LL / (16 * 16));
+            codedbuf_type = VAEncCodedBufferType;
+        }
+
+        for (numAllocated = 0; numAllocated < surface_num; numAllocated++) {
+            VABufferID coded_buf;
+            va_res = vaCreateBuffer(g_session.va_display,
+                                    context_id,
+                                    codedbuf_type,
+                                    codedbuf_size,
+                                    1,
+                                    NULL,
+                                    &coded_buf);
+            mfx_res = va_to_mfx_status(va_res);
+            if (MFX_ERR_NONE != mfx_res)
+                break;
+
+            surfaces[numAllocated] = coded_buf;
+        }
+    }
+
+    if (va_res == VA_STATUS_SUCCESS) {
+        av_log(avctx, AV_LOG_INFO, "VAAPI: %d VA surfaces have been allocated\n", surface_num);
+        for (i=0; i<surface_num; i++) {
+            vaapi_mid = av_mallocz(sizeof(*vaapi_mid));
+            vaapi_mid->m_fourcc  = fourcc;
+            vaapi_mid->m_surface = surfaces + i;
+            mids[i] = vaapi_mid;
+        }
+
+        if ((request->Type & MFX_MEMTYPE_EXTERNAL_FRAME) &&
+            (request->Type & MFX_MEMTYPE_FROM_DECODE)) {
+            q->surface_ptrs = av_realloc(q->surface_ptrs,
+                    sizeof(*q->surface_ptrs) * (q->nb_surfaces + surface_num));
+            q->surface_used = av_realloc(q->surface_used,
+                    sizeof(*q->surface_used) * (q->nb_surfaces + surface_num));
+
+            for (i=0; i<surface_num; i++,q->nb_surfaces++) {
+                mfxsurface = av_mallocz(sizeof(*mfxsurface));
+                memcpy(&mfxsurface->Info, &request->Info, sizeof(mfxFrameInfo));
+                mfxsurface->Data.MemId = mids[i];
+                q->surface_ptrs[q->nb_surfaces] = mfxsurface;
+                q->surface_used[q->nb_surfaces] = 0;
+            }
+        }
+        response->mids = mids;
+        response->NumFrameActual = surface_num;
+    } else {
+        response->mids = NULL;
+        response->NumFrameActual = 0;
+
+        if (VA_FOURCC_P208 != va_fourcc || fourcc==MFX_FOURCC_VP8_MBDATA) {
+            if (bCreateSrfSucceeded)
+                vaDestroySurfaces(g_session.va_display, surfaces, surface_num);
+        } else {
+            for (i=0; i<numAllocated; i++)
+                vaDestroyBuffer(g_session.va_display, surfaces[i]);
+        }
+
+        for (i=0; i<surface_num; i++) {
+            vaapi_mid = mids[i];
+            av_freep(&vaapi_mid);
+        }
+        av_freep(&mids);
+        av_freep(&surfaces);
+
+        av_log(avctx, AV_LOG_INFO, "ERROR: VA Surfaces allocation failed\n");
+
+        return MFX_ERR_MEMORY_ALLOC;
+    }
+
+    return MFX_ERR_NONE;
+}
+
+static mfxStatus frame_free(mfxHDL pthis, mfxFrameAllocResponse *response)
+{
+    vaapiMemId   *vaapi_mid = NULL;
+    VASurfaceID  *surfaces = NULL;
+    mfxU32        i = 0;
+    bool          isBitstreamMemory = false;
+    mfxU32        mfx_fourcc;
+
+    if (!response)
+        return MFX_ERR_NULL_PTR;
+
+    av_log( NULL, AV_LOG_INFO, "=========vaapi free frame: %d==============\n", response->NumFrameActual);
+    if (response->mids) {
+        surfaces   = ((vaapiMemId*)response->mids[0])->m_surface;
+        mfx_fourcc = ConvertVP8FourccToMfxFourcc(((vaapiMemId*)response->mids[0])->m_fourcc);
+        isBitstreamMemory = (MFX_FOURCC_P8==mfx_fourcc)?true:false;
+
+        for (i = 0; i < response->NumFrameActual; ++i) {
+            vaapi_mid = (vaapiMemId*)response->mids[i];
+            if (MFX_FOURCC_P8 == mfx_fourcc) {
+                vaDestroyBuffer(g_session.va_display, surfaces[i]);
+            } else if (vaapi_mid->m_sys_buffer) {
+                free(vaapi_mid->m_sys_buffer);
+            }
+            av_freep(&vaapi_mid);
+        }
+
+        if (!isBitstreamMemory)
+            vaDestroySurfaces(g_session.va_display, surfaces, response->NumFrameActual);
+
+        av_freep(&response->mids);
+        av_freep(&surfaces);
+    }
+
+    response->NumFrameActual = 0;
+
+    return MFX_ERR_NONE;
+}
+
+static mfxStatus frame_lock(mfxHDL pthis, mfxMemId mid, mfxFrameData *ptr)
+{
+    mfxStatus             mfx_res = MFX_ERR_NONE;
+    VAStatus              va_res  = VA_STATUS_SUCCESS;
+    VACodedBufferSegment *coded_buffer_segment;
+    vaapiMemId           *vaapi_mid = (vaapiMemId*)mid;
+    mfxU8                *pBuffer = 0;
+    mfxU32                mfx_fourcc;
+
+    if (!mid)
+        return MFX_ERR_INVALID_HANDLE;
+
+    mfx_fourcc = ConvertVP8FourccToMfxFourcc(vaapi_mid->m_fourcc);
+    if (MFX_FOURCC_P8 == mfx_fourcc) {
+        if (vaapi_mid->m_fourcc == MFX_FOURCC_VP8_SEGMAP) {
+            va_res =  vaMapBuffer(g_session.va_display, *(vaapi_mid->m_surface), (void **)(&pBuffer));
+        } else {
+            va_res =  vaMapBuffer(g_session.va_display, *(vaapi_mid->m_surface), (void **)(&coded_buffer_segment));
+        }
+        mfx_res = va_to_mfx_status(va_res);
+        if (MFX_ERR_NONE == mfx_res) {
+            if (vaapi_mid->m_fourcc == MFX_FOURCC_VP8_SEGMAP) {
+                ptr->Y = pBuffer;
+            } else {
+                ptr->Y = (mfxU8*)coded_buffer_segment->buf;
+            }
+        }
+    } else {
+        va_res = vaSyncSurface(g_session.va_display, *(vaapi_mid->m_surface));
+        mfx_res = va_to_mfx_status(va_res);
+        if (MFX_ERR_NONE == mfx_res) {
+            va_res = vaDeriveImage(g_session.va_display, *(vaapi_mid->m_surface), &(vaapi_mid->m_image));
+            mfx_res = va_to_mfx_status(va_res);
+        }
+
+        if (MFX_ERR_NONE == mfx_res) {
+            va_res = vaMapBuffer(g_session.va_display, vaapi_mid->m_image.buf, (void **) &pBuffer);
+            mfx_res = va_to_mfx_status(va_res);
+        }
+
+        if (MFX_ERR_NONE == mfx_res) {
+            switch (vaapi_mid->m_image.format.fourcc) {
+                case VA_FOURCC_NV12:
+                    if (mfx_fourcc == MFX_FOURCC_NV12) {
+                        ptr->Pitch = (mfxU16)vaapi_mid->m_image.pitches[0];
+                        ptr->Y = pBuffer + vaapi_mid->m_image.offsets[0];
+                        ptr->U = pBuffer + vaapi_mid->m_image.offsets[1];
+                        ptr->V = ptr->U + 1;
+                    } else {
+                        mfx_res = MFX_ERR_LOCK_MEMORY;
+                    }
+                    break;
+
+                case VA_FOURCC_YV12:
+                    if (mfx_fourcc == MFX_FOURCC_YV12) {
+                        ptr->Pitch = (mfxU16)vaapi_mid->m_image.pitches[0];
+                        ptr->Y = pBuffer + vaapi_mid->m_image.offsets[0];
+                        ptr->V = pBuffer + vaapi_mid->m_image.offsets[1];
+                        ptr->U = pBuffer + vaapi_mid->m_image.offsets[2];
+                    } else {
+                        mfx_res = MFX_ERR_LOCK_MEMORY;
+                    }
+                    break;
+
+                case VA_FOURCC_YUY2:
+                    if (mfx_fourcc == MFX_FOURCC_YUY2) {
+                        ptr->Pitch = (mfxU16)vaapi_mid->m_image.pitches[0];
+                        ptr->Y = pBuffer + vaapi_mid->m_image.offsets[0];
+                        ptr->U = ptr->Y + 1;
+                        ptr->V = ptr->Y + 3;
+                    } else {
+                        mfx_res = MFX_ERR_LOCK_MEMORY;
+                    }
+                    break;
+
+                case VA_FOURCC_ARGB:
+                    if (mfx_fourcc == MFX_FOURCC_RGB4) {
+                        ptr->Pitch = (mfxU16)vaapi_mid->m_image.pitches[0];
+                        ptr->B = pBuffer + vaapi_mid->m_image.offsets[0];
+                        ptr->G = ptr->B + 1;
+                        ptr->R = ptr->B + 2;
+                        ptr->A = ptr->B + 3;
+                    } else {
+                        mfx_res = MFX_ERR_LOCK_MEMORY;
+                    }
+                    break;
+
+                case VA_FOURCC_P208:
+                    if (mfx_fourcc == MFX_FOURCC_NV12) {
+                        ptr->Pitch = (mfxU16)vaapi_mid->m_image.pitches[0];
+                        ptr->Y = pBuffer + vaapi_mid->m_image.offsets[0];
+                    } else {
+                        mfx_res = MFX_ERR_LOCK_MEMORY;
+                    }
+                    break;
+
+                default:
+                    mfx_res = MFX_ERR_LOCK_MEMORY;
+                    break;
+            }
+        }
+    }
+    return mfx_res;
+}
+
+static mfxStatus frame_unlock(mfxHDL pthis, mfxMemId mid, mfxFrameData *ptr)
+{
+    mfxU32        mfx_fourcc;
+    vaapiMemId   *vaapi_mid = (vaapiMemId*)mid;
+
+    if (!vaapi_mid || !vaapi_mid->m_surface)
+        return MFX_ERR_INVALID_HANDLE;
+
+    mfx_fourcc = ConvertVP8FourccToMfxFourcc(vaapi_mid->m_fourcc);
+    if (mfx_fourcc == MFX_FOURCC_P8) {
+        vaUnmapBuffer(g_session.va_display, *(vaapi_mid->m_surface));
+    } else {
+        vaUnmapBuffer(g_session.va_display, vaapi_mid->m_image.buf);
+        vaDestroyImage(g_session.va_display, vaapi_mid->m_image.image_id);
+
+        if (ptr) {
+            ptr->Pitch = 0;
+            ptr->Y     = NULL;
+            ptr->U     = NULL;
+            ptr->V     = NULL;
+            ptr->A     = NULL;
+        }
+    }
+
+    return MFX_ERR_NONE;
+}
+
+static mfxStatus frame_get_hdl(mfxHDL pthis, mfxMemId mid, mfxHDL *handle)
+{
+    vaapiMemId* vaapi_mid = (vaapiMemId*)mid;
+
+    if (!handle || !mid)
+        return MFX_ERR_INVALID_HANDLE;
+
+    *handle = (mfxHDL) vaapi_mid->m_surface;
+
+    return MFX_ERR_NONE;
+}
+
+static int qsv_check_filters(const OutputStream *ost)
+{
+    AVFilterGraph *graph = NULL;
+    char args[512];
+    AVDictionaryEntry *e = NULL;
+    AVFilterInOut *inputs, *outputs;
+    int ret = 0;
+    int i;
+    const char *filter_list = "buffer|buffersink|null|format|setpts";
+
+    if (!ost->avfilter)
+        return -1;
+
+    graph = avfilter_graph_alloc();
+    if (!graph)
+        return AVERROR(ENOMEM);
+
+    args[0] = 0;
+    while ((e = av_dict_get(ost->sws_dict, "", e, AV_DICT_IGNORE_SUFFIX)))
+        av_strlcatf(args, sizeof(args), "%s=%s:", e->key, e->value);
+    if (strlen(args))
+        args[strlen(args)-1] = 0;
+    graph->scale_sws_opts = av_strdup(args);
+
+    if ((ret = avfilter_graph_parse2(graph, ost->avfilter, &inputs, &outputs)) < 0)
+        goto out;
+
+    av_log(NULL, AV_LOG_DEBUG, "total filters %d\n", graph->nb_filters);
+    for (i = 0; i < graph->nb_filters; i++) {
+        av_log(NULL, AV_LOG_DEBUG, "\tfilter name: %s \n",
+                graph->filters[i]->filter->name);
+        if (!av_match_list(graph->filters[i]->filter->name, filter_list, '|')) {
+            ret = -1;
+            goto out;
+        }
+    }
+
+out:
+    avfilter_inout_free(&inputs);
+    avfilter_inout_free(&outputs);
+    avfilter_graph_free(&graph);
+    return ret;
+}
+
+static void qsv_uninit_vidmem(AVCodecContext *s)
+{
+    int i;
+
+    InputStream *ist = s->opaque;
+    QSVContext  *qsv = ist->hwaccel_ctx;
+
+    ff_qsv_close_internal_session(&g_session);
+
+    av_freep(&qsv->ost->enc_ctx->hwaccel_context);
+    av_freep(&s->hwaccel_context);
+
+    for (i = 0; i < qsv->nb_surfaces; i++)
+        av_freep(&qsv->surface_ptrs[i]);
+    av_freep(&qsv->surface_used);
+    av_freep(&qsv->surface_ptrs);
+
+    av_freep(&qsv);
+}
+
+int qsv_init_vidmem(AVCodecContext *s)
+{
+    InputStream  *ist = s->opaque;
+    QSVContext   *qsv = ist->hwaccel_ctx;
+    AVQSVContext *hwctx_dec;
+
+    if (!qsv) {
+        av_log(NULL, AV_LOG_ERROR, "QSV transcoding is not initialized. "
+               "-hwaccel qsv should only be used for one-to-one QSV transcoding "
+               "with no filters.\n");
+        return AVERROR_BUG;
+    }
+
+    hwctx_dec = av_qsv_alloc_context();
+    if (!hwctx_dec)
+        return AVERROR(ENOMEM);
+
+    hwctx_dec->session        = qsv->session;
+    hwctx_dec->iopattern      = MFX_IOPATTERN_OUT_VIDEO_MEMORY;
+    hwctx_dec->ext_buffers    = NULL;
+    hwctx_dec->nb_ext_buffers = 0;
+
+    av_freep(&s->hwaccel_context);
+    s->hwaccel_context        = hwctx_dec;
+
+    ist->hwaccel_get_buffer   = qsv_get_buffer;
+    ist->hwaccel_uninit       = qsv_uninit_vidmem;
+
+    return 0;
+}
+
+int qsv_transcode_init_vidmem(OutputStream *ost)
+{
+    InputStream *ist = NULL;
+    const enum AVPixelFormat *pix_fmt;
+
+    AVDictionaryEntry *e;
+    const AVOption *opt;
+    int flags = 0;
+
+    int err, i;
+
+    QSVContext *qsv = NULL;
+    AVQSVContext *enc_hwctx = NULL;
+
+    /* check if the encoder supports QSV */
+    if (!ost->enc->pix_fmts)
+        return 0;
+    for (pix_fmt = ost->enc->pix_fmts; *pix_fmt != AV_PIX_FMT_NONE; pix_fmt++)
+        if (*pix_fmt == AV_PIX_FMT_QSV)
+            break;
+    if (*pix_fmt == AV_PIX_FMT_NONE)
+        return 0;
+
+    /*Check if the filters support QSV*/
+    if (ost->source_index < 0 || qsv_check_filters(ost) < 0)
+        return 0;
+
+    /* check if the decoder supports QSV and the output only goes to this stream */
+    ist = input_streams[ost->source_index];
+    if (ist->nb_filters || ist->hwaccel_id != HWACCEL_QSV ||
+        !ist->dec || !ist->dec->pix_fmts)
+        return 0;
+    for (pix_fmt = ist->dec->pix_fmts; *pix_fmt != AV_PIX_FMT_NONE; pix_fmt++)
+        if (*pix_fmt == AV_PIX_FMT_QSV)
+            break;
+    if (*pix_fmt == AV_PIX_FMT_NONE)
+        return 0;
+
+    for (i = 0; i < nb_output_streams; i++)
+        if (output_streams[i] != ost &&
+            output_streams[i]->source_index == ost->source_index)
+            return 0;
+
+    av_log(NULL, AV_LOG_VERBOSE, "Setting up QSV transcoding\n");
+
+    qsv   = av_mallocz(sizeof(*qsv));
+    enc_hwctx = av_qsv_alloc_context();
+    if (!qsv || !enc_hwctx)
+        goto fail;
+
+    err = ff_qsv_init_internal_session(NULL, &g_session);
+    if (err != MFX_ERR_NONE) {
+        av_log(NULL, AV_LOG_ERROR, "Error initializing an MFX session: %d\n", err);
+        goto fail;
+    }
+
+    qsv->ost = ost;
+    qsv->session = g_session.session;
+    qsv->frame_allocator.Alloc = frame_alloc;
+    qsv->frame_allocator.Free  = frame_free;
+    qsv->frame_allocator.Lock  = frame_lock;
+    qsv->frame_allocator.Unlock= frame_unlock;
+    qsv->frame_allocator.GetHDL= frame_get_hdl;
+    qsv->frame_allocator.pthis = qsv;
+
+    err = MFXVideoCORE_SetFrameAllocator(qsv->session, &qsv->frame_allocator);
+    if (MFX_ERR_NONE != err) {
+        av_log(NULL, AV_LOG_ERROR, "Error MFXVideoCORE_SetFrameAllocator: %d\n", err);
+        goto fail;
+    }
+
+    e = av_dict_get(ost->encoder_opts, "flags", NULL, 0);
+    opt = av_opt_find(ost->enc_ctx, "flags", NULL, 0, 0);
+    if (e && opt)
+        av_opt_eval_flags(ost->enc_ctx, opt, e->value, &flags);
+
+    enc_hwctx->session               = qsv->session;
+    enc_hwctx->iopattern             = MFX_IOPATTERN_IN_VIDEO_MEMORY;
+    ost->hwaccel_ctx                 = qsv;
+    ost->enc_ctx->hwaccel_context    = enc_hwctx;
+
+    ist->resample_pix_fmt            = AV_PIX_FMT_QSV;
+    ist->hwaccel_ctx                 = qsv;
+
+    return 0;
+
+fail:
+    av_freep(&enc_hwctx);
+    av_freep(&qsv);
+
+    return AVERROR_UNKNOWN;
+}
diff --git a/libavcodec/qsv.h b/libavcodec/qsv.h
index b77158e..ee968d0 100644
--- a/libavcodec/qsv.h
+++ b/libavcodec/qsv.h
@@ -95,6 +95,9 @@  typedef struct AVQSVContext {
      * the opaque allocation request.
      */
     int opaque_alloc_type;
+
+    int nb_decoder_surfaces;
+    int nb_encoder_surfaces;
 } AVQSVContext;
 
 /**
diff --git a/libavcodec/qsv_internal.h b/libavcodec/qsv_internal.h
index e43728b..58589df 100644
--- a/libavcodec/qsv_internal.h
+++ b/libavcodec/qsv_internal.h
@@ -21,6 +21,8 @@ 
 #ifndef AVCODEC_QSV_INTERNAL_H
 #define AVCODEC_QSV_INTERNAL_H
 
+#include "config.h"
+
 #if CONFIG_VAAPI
 #define AVCODEC_QSV_LINUX_SESSION_HANDLE
 #endif //CONFIG_VAAPI
diff --git a/libavcodec/qsvdec.c b/libavcodec/qsvdec.c
index b9de0af..47dd818 100644
--- a/libavcodec/qsvdec.c
+++ b/libavcodec/qsvdec.c
@@ -72,6 +72,7 @@  static int qsv_decode_init(AVCodecContext *avctx, QSVContext *q, AVPacket *avpkt
         q->iopattern      = qsv->iopattern;
         q->ext_buffers    = qsv->ext_buffers;
         q->nb_ext_buffers = qsv->nb_ext_buffers;
+        qsv->nb_decoder_surfaces = q->async_depth;
     }
     if (!q->session) {
         if (!q->internal_qs.session) {
@@ -88,7 +89,7 @@  static int qsv_decode_init(AVCodecContext *avctx, QSVContext *q, AVPacket *avpkt
         if (ret < 0) {
             av_log(avctx, AV_LOG_ERROR, "Failed to load plugins %s, ret = %s\n",
                     q->load_plugins, av_err2str(ret));
-            return ff_qsv_error(ret);
+            return ret;
         }
     }
 
@@ -149,7 +150,7 @@  static int qsv_decode_init(AVCodecContext *avctx, QSVContext *q, AVPacket *avpkt
        So weare  pre-allocating fifo big enough for 17 elements:
      */
     if (!q->async_fifo) {
-        q->async_fifo = av_fifo_alloc((1 + 16) *
+        q->async_fifo = av_fifo_alloc((1 + 16 + q->async_depth) *
                                       (sizeof(mfxSyncPoint) + sizeof(QSVFrame*)));
         if (!q->async_fifo)
             return AVERROR(ENOMEM);
diff --git a/libavcodec/qsvenc.c b/libavcodec/qsvenc.c
index 81b8f6f..041f298 100644
--- a/libavcodec/qsvenc.c
+++ b/libavcodec/qsvenc.c
@@ -760,6 +760,8 @@  int ff_qsv_enc_init(AVCodecContext *avctx, QSVEncContext *q)
 
             q->param.ExtParam[q->param.NumExtParam++] = q->extparam_internal[i];
         }
+
+        qsv->nb_encoder_surfaces = q->req.NumFrameSuggested + q->async_depth;
     } else {
         q->param.ExtParam    = q->extparam_internal;
         q->param.NumExtParam = q->nb_extparam_internal;