[FFmpeg-devel,2/6] avcodec/nvdec: avoid needless copy of output frame

Submitted by Timo Rothenpieler on May 8, 2018, 1:31 p.m.

Details

Message ID 20180508133132.28940-2-timo@rothenpieler.org
State New
Headers show

Commit Message

Timo Rothenpieler May 8, 2018, 1:31 p.m.
Replaces the data pointers with the mapped cuvid ones.
Adds buffer_refs to the frame to ensure the needed contexts stay alive
and the cuvid idx stays allocated.
Adds another buffer_ref to unmap the frame when it's unreferenced itself.
---
 libavcodec/nvdec.c | 83 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 60 insertions(+), 23 deletions(-)

Comments

wm4 May 8, 2018, 3:25 p.m.
On Tue,  8 May 2018 15:31:28 +0200
Timo Rothenpieler <timo@rothenpieler.org> wrote:

> Replaces the data pointers with the mapped cuvid ones.
> Adds buffer_refs to the frame to ensure the needed contexts stay alive
> and the cuvid idx stays allocated.
> Adds another buffer_ref to unmap the frame when it's unreferenced itself.
> ---
>  libavcodec/nvdec.c | 83 +++++++++++++++++++++++++++++++++-------------
>  1 file changed, 60 insertions(+), 23 deletions(-)
> 
> diff --git a/libavcodec/nvdec.c b/libavcodec/nvdec.c
> index ab3cb88b27..d98f9dd95e 100644
> --- a/libavcodec/nvdec.c
> +++ b/libavcodec/nvdec.c
> @@ -308,7 +308,7 @@ int ff_nvdec_decode_init(AVCodecContext *avctx)
>      params.CodecType           = cuvid_codec_type;
>      params.ChromaFormat        = cuvid_chroma_format;
>      params.ulNumDecodeSurfaces = frames_ctx->initial_pool_size;
> -    params.ulNumOutputSurfaces = 1;
> +    params.ulNumOutputSurfaces = frames_ctx->initial_pool_size;
>  
>      ret = nvdec_decoder_create(&ctx->decoder_ref, frames_ctx->device_ref, &params, avctx);
>      if (ret < 0) {
> @@ -354,6 +354,32 @@ static void nvdec_fdd_priv_free(void *priv)
>      av_freep(&priv);
>  }
>  
> +static void nvdec_unmap_mapped_frame(void *opaque, uint8_t *data)
> +{
> +    NVDECFrame *unmap_data = (NVDECFrame*)data;
> +    NVDECDecoder *decoder = (NVDECDecoder*)unmap_data->decoder_ref->data;
> +    CUdeviceptr devptr = (CUdeviceptr)opaque;
> +    CUresult err;
> +    CUcontext dummy;
> +
> +    err = decoder->cudl->cuCtxPushCurrent(decoder->cuda_ctx);
> +    if (err != CUDA_SUCCESS) {
> +        av_log(NULL, AV_LOG_ERROR, "cuCtxPushCurrent failed\n");
> +        goto finish;
> +    }
> +
> +    err = decoder->cvdl->cuvidUnmapVideoFrame(decoder->decoder, devptr);
> +    if (err != CUDA_SUCCESS)
> +        av_log(NULL, AV_LOG_ERROR, "cuvidUnmapVideoFrame failed\n");
> +
> +    decoder->cudl->cuCtxPopCurrent(&dummy);
> +
> +finish:
> +    av_buffer_unref(&unmap_data->idx_ref);
> +    av_buffer_unref(&unmap_data->decoder_ref);
> +    av_free(unmap_data);
> +}
> +
>  static int nvdec_retrieve_data(void *logctx, AVFrame *frame)
>  {
>      FrameDecodeData  *fdd = (FrameDecodeData*)frame->private_ref->data;
> @@ -361,6 +387,7 @@ static int nvdec_retrieve_data(void *logctx, AVFrame *frame)
>      NVDECDecoder *decoder = (NVDECDecoder*)cf->decoder_ref->data;
>  
>      CUVIDPROCPARAMS vpp = { .progressive_frame = 1 };
> +    NVDECFrame *unmap_data = NULL;
>  
>      CUresult err;
>      CUcontext dummy;
> @@ -383,32 +410,39 @@ static int nvdec_retrieve_data(void *logctx, AVFrame *frame)
>          goto finish;
>      }
>  
> -    for (i = 0; frame->data[i]; i++) {
> -        CUDA_MEMCPY2D cpy = {
> -            .srcMemoryType = CU_MEMORYTYPE_DEVICE,
> -            .dstMemoryType = CU_MEMORYTYPE_DEVICE,
> -            .srcDevice     = devptr,
> -            .dstDevice     = (CUdeviceptr)frame->data[i],
> -            .srcPitch      = pitch,
> -            .dstPitch      = frame->linesize[i],
> -            .srcY          = offset,
> -            .WidthInBytes  = FFMIN(pitch, frame->linesize[i]),
> -            .Height        = frame->height >> (i ? 1 : 0),
> -        };
> -
> -        err = decoder->cudl->cuMemcpy2D(&cpy);
> -        if (err != CUDA_SUCCESS) {
> -            av_log(logctx, AV_LOG_ERROR, "Error copying decoded frame: %d\n",
> -                   err);
> -            ret = AVERROR_UNKNOWN;
> -            goto copy_fail;
> -        }
> +    unmap_data = av_mallocz(sizeof(*unmap_data));
> +    if (!unmap_data) {
> +        ret = AVERROR(ENOMEM);
> +        goto copy_fail;
> +    }
>  
> -        offset += cpy.Height;
> +    frame->buf[1] = av_buffer_create((uint8_t *)unmap_data, sizeof(*unmap_data),
> +                                     nvdec_unmap_mapped_frame, (void*)devptr,
> +                                     AV_BUFFER_FLAG_READONLY);
> +    if (!frame->buf[1]) {
> +        ret = AVERROR(ENOMEM);
> +        goto copy_fail;
>      }
>  
> +    unmap_data->idx = cf->idx;
> +    unmap_data->idx_ref = av_buffer_ref(cf->idx_ref);
> +    unmap_data->decoder_ref = av_buffer_ref(cf->decoder_ref);
> +
> +    for (i = 0; frame->linesize[i]; i++) {
> +        frame->data[i] = (uint8_t*)(devptr + offset);
> +        frame->linesize[i] = pitch;
> +        offset += pitch * (frame->height >> (i ? 1 : 0));
> +    }
> +
> +    goto finish;
> +
>  copy_fail:
> -    decoder->cvdl->cuvidUnmapVideoFrame(decoder->decoder, devptr);
> +    if (!frame->buf[1]) {
> +        decoder->cvdl->cuvidUnmapVideoFrame(decoder->decoder, devptr);
> +        av_freep(&unmap_data);
> +    } else {
> +        av_buffer_unref(&frame->buf[1]);
> +    }
>  
>  finish:
>      decoder->cudl->cuCtxPopCurrent(&dummy);
> @@ -526,6 +560,7 @@ int ff_nvdec_frame_params(AVCodecContext *avctx,
>                            int dpb_size)
>  {
>      AVHWFramesContext *frames_ctx = (AVHWFramesContext*)hw_frames_ctx->data;
> +    AVCUDAFramesContext *hwctx = (AVCUDAFramesContext*)frames_ctx->hwctx;
>      const AVPixFmtDescriptor *sw_desc;
>      int cuvid_codec_type, cuvid_chroma_format;
>  
> @@ -550,6 +585,8 @@ int ff_nvdec_frame_params(AVCodecContext *avctx,
>      frames_ctx->height            = (avctx->coded_height + 1) & ~1;
>      frames_ctx->initial_pool_size = dpb_size;
>  
> +    hwctx->flags = AV_CUDA_HWFRAMES_DUMMY_MODE;
> +
>      switch (sw_desc->comp[0].depth) {
>      case 8:
>          frames_ctx->sw_format = AV_PIX_FMT_NV12;

Probably breaks if the API user doesn't use this dummy flag.
Timo Rothenpieler May 8, 2018, 3:49 p.m.
Am 08.05.2018 um 17:25 schrieb wm4:
> On Tue,  8 May 2018 15:31:28 +0200
> Timo Rothenpieler <timo@rothenpieler.org> wrote:
> 
>> Replaces the data pointers with the mapped cuvid ones.
>> Adds buffer_refs to the frame to ensure the needed contexts stay alive
>> and the cuvid idx stays allocated.
>> Adds another buffer_ref to unmap the frame when it's unreferenced itself.
>> ---
>>   libavcodec/nvdec.c | 83 +++++++++++++++++++++++++++++++++-------------
>>   1 file changed, 60 insertions(+), 23 deletions(-)
>>
>> diff --git a/libavcodec/nvdec.c b/libavcodec/nvdec.c
>> index ab3cb88b27..d98f9dd95e 100644
>> --- a/libavcodec/nvdec.c
>> +++ b/libavcodec/nvdec.c
>> @@ -308,7 +308,7 @@ int ff_nvdec_decode_init(AVCodecContext *avctx)
>>       params.CodecType           = cuvid_codec_type;
>>       params.ChromaFormat        = cuvid_chroma_format;
>>       params.ulNumDecodeSurfaces = frames_ctx->initial_pool_size;
>> -    params.ulNumOutputSurfaces = 1;
>> +    params.ulNumOutputSurfaces = frames_ctx->initial_pool_size;
>>   
>>       ret = nvdec_decoder_create(&ctx->decoder_ref, frames_ctx->device_ref, &params, avctx);
>>       if (ret < 0) {
>> @@ -354,6 +354,32 @@ static void nvdec_fdd_priv_free(void *priv)
>>       av_freep(&priv);
>>   }
>>   
>> +static void nvdec_unmap_mapped_frame(void *opaque, uint8_t *data)
>> +{
>> +    NVDECFrame *unmap_data = (NVDECFrame*)data;
>> +    NVDECDecoder *decoder = (NVDECDecoder*)unmap_data->decoder_ref->data;
>> +    CUdeviceptr devptr = (CUdeviceptr)opaque;
>> +    CUresult err;
>> +    CUcontext dummy;
>> +
>> +    err = decoder->cudl->cuCtxPushCurrent(decoder->cuda_ctx);
>> +    if (err != CUDA_SUCCESS) {
>> +        av_log(NULL, AV_LOG_ERROR, "cuCtxPushCurrent failed\n");
>> +        goto finish;
>> +    }
>> +
>> +    err = decoder->cvdl->cuvidUnmapVideoFrame(decoder->decoder, devptr);
>> +    if (err != CUDA_SUCCESS)
>> +        av_log(NULL, AV_LOG_ERROR, "cuvidUnmapVideoFrame failed\n");
>> +
>> +    decoder->cudl->cuCtxPopCurrent(&dummy);
>> +
>> +finish:
>> +    av_buffer_unref(&unmap_data->idx_ref);
>> +    av_buffer_unref(&unmap_data->decoder_ref);
>> +    av_free(unmap_data);
>> +}
>> +
>>   static int nvdec_retrieve_data(void *logctx, AVFrame *frame)
>>   {
>>       FrameDecodeData  *fdd = (FrameDecodeData*)frame->private_ref->data;
>> @@ -361,6 +387,7 @@ static int nvdec_retrieve_data(void *logctx, AVFrame *frame)
>>       NVDECDecoder *decoder = (NVDECDecoder*)cf->decoder_ref->data;
>>   
>>       CUVIDPROCPARAMS vpp = { .progressive_frame = 1 };
>> +    NVDECFrame *unmap_data = NULL;
>>   
>>       CUresult err;
>>       CUcontext dummy;
>> @@ -383,32 +410,39 @@ static int nvdec_retrieve_data(void *logctx, AVFrame *frame)
>>           goto finish;
>>       }
>>   
>> -    for (i = 0; frame->data[i]; i++) {
>> -        CUDA_MEMCPY2D cpy = {
>> -            .srcMemoryType = CU_MEMORYTYPE_DEVICE,
>> -            .dstMemoryType = CU_MEMORYTYPE_DEVICE,
>> -            .srcDevice     = devptr,
>> -            .dstDevice     = (CUdeviceptr)frame->data[i],
>> -            .srcPitch      = pitch,
>> -            .dstPitch      = frame->linesize[i],
>> -            .srcY          = offset,
>> -            .WidthInBytes  = FFMIN(pitch, frame->linesize[i]),
>> -            .Height        = frame->height >> (i ? 1 : 0),
>> -        };
>> -
>> -        err = decoder->cudl->cuMemcpy2D(&cpy);
>> -        if (err != CUDA_SUCCESS) {
>> -            av_log(logctx, AV_LOG_ERROR, "Error copying decoded frame: %d\n",
>> -                   err);
>> -            ret = AVERROR_UNKNOWN;
>> -            goto copy_fail;
>> -        }
>> +    unmap_data = av_mallocz(sizeof(*unmap_data));
>> +    if (!unmap_data) {
>> +        ret = AVERROR(ENOMEM);
>> +        goto copy_fail;
>> +    }
>>   
>> -        offset += cpy.Height;
>> +    frame->buf[1] = av_buffer_create((uint8_t *)unmap_data, sizeof(*unmap_data),
>> +                                     nvdec_unmap_mapped_frame, (void*)devptr,
>> +                                     AV_BUFFER_FLAG_READONLY);
>> +    if (!frame->buf[1]) {
>> +        ret = AVERROR(ENOMEM);
>> +        goto copy_fail;
>>       }
>>   
>> +    unmap_data->idx = cf->idx;
>> +    unmap_data->idx_ref = av_buffer_ref(cf->idx_ref);
>> +    unmap_data->decoder_ref = av_buffer_ref(cf->decoder_ref);
>> +
>> +    for (i = 0; frame->linesize[i]; i++) {
>> +        frame->data[i] = (uint8_t*)(devptr + offset);
>> +        frame->linesize[i] = pitch;
>> +        offset += pitch * (frame->height >> (i ? 1 : 0));
>> +    }
>> +
>> +    goto finish;
>> +
>>   copy_fail:
>> -    decoder->cvdl->cuvidUnmapVideoFrame(decoder->decoder, devptr);
>> +    if (!frame->buf[1]) {
>> +        decoder->cvdl->cuvidUnmapVideoFrame(decoder->decoder, devptr);
>> +        av_freep(&unmap_data);
>> +    } else {
>> +        av_buffer_unref(&frame->buf[1]);
>> +    }
>>   
>>   finish:
>>       decoder->cudl->cuCtxPopCurrent(&dummy);
>> @@ -526,6 +560,7 @@ int ff_nvdec_frame_params(AVCodecContext *avctx,
>>                             int dpb_size)
>>   {
>>       AVHWFramesContext *frames_ctx = (AVHWFramesContext*)hw_frames_ctx->data;
>> +    AVCUDAFramesContext *hwctx = (AVCUDAFramesContext*)frames_ctx->hwctx;
>>       const AVPixFmtDescriptor *sw_desc;
>>       int cuvid_codec_type, cuvid_chroma_format;
>>   
>> @@ -550,6 +585,8 @@ int ff_nvdec_frame_params(AVCodecContext *avctx,
>>       frames_ctx->height            = (avctx->coded_height + 1) & ~1;
>>       frames_ctx->initial_pool_size = dpb_size;
>>   
>> +    hwctx->flags = AV_CUDA_HWFRAMES_DUMMY_MODE;
>> +
>>       switch (sw_desc->comp[0].depth) {
>>       case 8:
>>           frames_ctx->sw_format = AV_PIX_FMT_NV12;
> 
> Probably breaks if the API user doesn't use this dummy flag.

It intentionally stores the other buffer_ref in buf[1] so it doesn't 
leak that memory if the context is supplied externally.
It'll just be pointlessly wasting memory.
Timo Rothenpieler May 9, 2018, 10:51 p.m.
applied
follow-up CUstream patches also applied
Wang Bin May 10, 2018, 2:51 a.m.
>
>
>
> -        offset += cpy.Height;
> +    frame->buf[1] = av_buffer_create((uint8_t *)unmap_data,
> sizeof(*unmap_data),
> +                                     nvdec_unmap_mapped_frame,
> (void*)devptr,
> +                                     AV_BUFFER_FLAG_READONLY);
> +    if (!frame->buf[1]) {
> +        ret = AVERROR(ENOMEM);
> +        goto copy_fail;
>      }
>
>
If AVFrame.buf[i] is non-NULL, then buf[j] must be non-NULL for all j < i,
see libavutil/frame.h. So either change the comment in frame.h or change
your implementation is required
Timo Rothenpieler May 10, 2018, 9:39 a.m.
Am 10.05.2018 um 04:51 schrieb Wang Bin:
>>
>>
>>
>> -        offset += cpy.Height;
>> +    frame->buf[1] = av_buffer_create((uint8_t *)unmap_data,
>> sizeof(*unmap_data),
>> +                                     nvdec_unmap_mapped_frame,
>> (void*)devptr,
>> +                                     AV_BUFFER_FLAG_READONLY);
>> +    if (!frame->buf[1]) {
>> +        ret = AVERROR(ENOMEM);
>> +        goto copy_fail;
>>       }
>>
>>
> If AVFrame.buf[i] is non-NULL, then buf[j] must be non-NULL for all j < i,
> see libavutil/frame.h. So either change the comment in frame.h or change
> your implementation is required

buf[0] is filled by the hw_frames_ctx, with a dummy buffer, but it is 
filled.
So after this buf[0] and buf[1] are filled, so I don't see the problem?
Oscar Amoros Huguet May 10, 2018, 5:15 p.m.
Just want to update,

We compiled master branch with this patches, tested and looked at NSIGHT.

Everything correct: 
- The internal NVDEC kernel is using the stream we have set in AVCUDADeviceContext struct.
- It slightly overlaps with other kernels, removing wait times between kernel executions.
- Nice that the Device to Device copies disappeared.

We are updating our ffmpeg compilation to this master branch commit.

It was a pleasure to collaborate.

Oscar

-----Original Message-----
From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Timo Rothenpieler

Sent: Thursday, May 10, 2018 12:52 AM
To: ffmpeg-devel@ffmpeg.org
Subject: Re: [FFmpeg-devel] [PATCH 2/6] avcodec/nvdec: avoid needless copy of output frame

applied
follow-up CUstream patches also applied

Patch hide | download patch | download mbox

diff --git a/libavcodec/nvdec.c b/libavcodec/nvdec.c
index ab3cb88b27..d98f9dd95e 100644
--- a/libavcodec/nvdec.c
+++ b/libavcodec/nvdec.c
@@ -308,7 +308,7 @@  int ff_nvdec_decode_init(AVCodecContext *avctx)
     params.CodecType           = cuvid_codec_type;
     params.ChromaFormat        = cuvid_chroma_format;
     params.ulNumDecodeSurfaces = frames_ctx->initial_pool_size;
-    params.ulNumOutputSurfaces = 1;
+    params.ulNumOutputSurfaces = frames_ctx->initial_pool_size;
 
     ret = nvdec_decoder_create(&ctx->decoder_ref, frames_ctx->device_ref, &params, avctx);
     if (ret < 0) {
@@ -354,6 +354,32 @@  static void nvdec_fdd_priv_free(void *priv)
     av_freep(&priv);
 }
 
+static void nvdec_unmap_mapped_frame(void *opaque, uint8_t *data)
+{
+    NVDECFrame *unmap_data = (NVDECFrame*)data;
+    NVDECDecoder *decoder = (NVDECDecoder*)unmap_data->decoder_ref->data;
+    CUdeviceptr devptr = (CUdeviceptr)opaque;
+    CUresult err;
+    CUcontext dummy;
+
+    err = decoder->cudl->cuCtxPushCurrent(decoder->cuda_ctx);
+    if (err != CUDA_SUCCESS) {
+        av_log(NULL, AV_LOG_ERROR, "cuCtxPushCurrent failed\n");
+        goto finish;
+    }
+
+    err = decoder->cvdl->cuvidUnmapVideoFrame(decoder->decoder, devptr);
+    if (err != CUDA_SUCCESS)
+        av_log(NULL, AV_LOG_ERROR, "cuvidUnmapVideoFrame failed\n");
+
+    decoder->cudl->cuCtxPopCurrent(&dummy);
+
+finish:
+    av_buffer_unref(&unmap_data->idx_ref);
+    av_buffer_unref(&unmap_data->decoder_ref);
+    av_free(unmap_data);
+}
+
 static int nvdec_retrieve_data(void *logctx, AVFrame *frame)
 {
     FrameDecodeData  *fdd = (FrameDecodeData*)frame->private_ref->data;
@@ -361,6 +387,7 @@  static int nvdec_retrieve_data(void *logctx, AVFrame *frame)
     NVDECDecoder *decoder = (NVDECDecoder*)cf->decoder_ref->data;
 
     CUVIDPROCPARAMS vpp = { .progressive_frame = 1 };
+    NVDECFrame *unmap_data = NULL;
 
     CUresult err;
     CUcontext dummy;
@@ -383,32 +410,39 @@  static int nvdec_retrieve_data(void *logctx, AVFrame *frame)
         goto finish;
     }
 
-    for (i = 0; frame->data[i]; i++) {
-        CUDA_MEMCPY2D cpy = {
-            .srcMemoryType = CU_MEMORYTYPE_DEVICE,
-            .dstMemoryType = CU_MEMORYTYPE_DEVICE,
-            .srcDevice     = devptr,
-            .dstDevice     = (CUdeviceptr)frame->data[i],
-            .srcPitch      = pitch,
-            .dstPitch      = frame->linesize[i],
-            .srcY          = offset,
-            .WidthInBytes  = FFMIN(pitch, frame->linesize[i]),
-            .Height        = frame->height >> (i ? 1 : 0),
-        };
-
-        err = decoder->cudl->cuMemcpy2D(&cpy);
-        if (err != CUDA_SUCCESS) {
-            av_log(logctx, AV_LOG_ERROR, "Error copying decoded frame: %d\n",
-                   err);
-            ret = AVERROR_UNKNOWN;
-            goto copy_fail;
-        }
+    unmap_data = av_mallocz(sizeof(*unmap_data));
+    if (!unmap_data) {
+        ret = AVERROR(ENOMEM);
+        goto copy_fail;
+    }
 
-        offset += cpy.Height;
+    frame->buf[1] = av_buffer_create((uint8_t *)unmap_data, sizeof(*unmap_data),
+                                     nvdec_unmap_mapped_frame, (void*)devptr,
+                                     AV_BUFFER_FLAG_READONLY);
+    if (!frame->buf[1]) {
+        ret = AVERROR(ENOMEM);
+        goto copy_fail;
     }
 
+    unmap_data->idx = cf->idx;
+    unmap_data->idx_ref = av_buffer_ref(cf->idx_ref);
+    unmap_data->decoder_ref = av_buffer_ref(cf->decoder_ref);
+
+    for (i = 0; frame->linesize[i]; i++) {
+        frame->data[i] = (uint8_t*)(devptr + offset);
+        frame->linesize[i] = pitch;
+        offset += pitch * (frame->height >> (i ? 1 : 0));
+    }
+
+    goto finish;
+
 copy_fail:
-    decoder->cvdl->cuvidUnmapVideoFrame(decoder->decoder, devptr);
+    if (!frame->buf[1]) {
+        decoder->cvdl->cuvidUnmapVideoFrame(decoder->decoder, devptr);
+        av_freep(&unmap_data);
+    } else {
+        av_buffer_unref(&frame->buf[1]);
+    }
 
 finish:
     decoder->cudl->cuCtxPopCurrent(&dummy);
@@ -526,6 +560,7 @@  int ff_nvdec_frame_params(AVCodecContext *avctx,
                           int dpb_size)
 {
     AVHWFramesContext *frames_ctx = (AVHWFramesContext*)hw_frames_ctx->data;
+    AVCUDAFramesContext *hwctx = (AVCUDAFramesContext*)frames_ctx->hwctx;
     const AVPixFmtDescriptor *sw_desc;
     int cuvid_codec_type, cuvid_chroma_format;
 
@@ -550,6 +585,8 @@  int ff_nvdec_frame_params(AVCodecContext *avctx,
     frames_ctx->height            = (avctx->coded_height + 1) & ~1;
     frames_ctx->initial_pool_size = dpb_size;
 
+    hwctx->flags = AV_CUDA_HWFRAMES_DUMMY_MODE;
+
     switch (sw_desc->comp[0].depth) {
     case 8:
         frames_ctx->sw_format = AV_PIX_FMT_NV12;