Message ID | 20180508133132.28940-2-timo@rothenpieler.org |
---|---|
State | Superseded |
Headers | show |
On Tue, 8 May 2018 15:31:28 +0200 Timo Rothenpieler <timo@rothenpieler.org> wrote: > Replaces the data pointers with the mapped cuvid ones. > Adds buffer_refs to the frame to ensure the needed contexts stay alive > and the cuvid idx stays allocated. > Adds another buffer_ref to unmap the frame when it's unreferenced itself. > --- > libavcodec/nvdec.c | 83 +++++++++++++++++++++++++++++++++------------- > 1 file changed, 60 insertions(+), 23 deletions(-) > > diff --git a/libavcodec/nvdec.c b/libavcodec/nvdec.c > index ab3cb88b27..d98f9dd95e 100644 > --- a/libavcodec/nvdec.c > +++ b/libavcodec/nvdec.c > @@ -308,7 +308,7 @@ int ff_nvdec_decode_init(AVCodecContext *avctx) > params.CodecType = cuvid_codec_type; > params.ChromaFormat = cuvid_chroma_format; > params.ulNumDecodeSurfaces = frames_ctx->initial_pool_size; > - params.ulNumOutputSurfaces = 1; > + params.ulNumOutputSurfaces = frames_ctx->initial_pool_size; > > ret = nvdec_decoder_create(&ctx->decoder_ref, frames_ctx->device_ref, ¶ms, avctx); > if (ret < 0) { > @@ -354,6 +354,32 @@ static void nvdec_fdd_priv_free(void *priv) > av_freep(&priv); > } > > +static void nvdec_unmap_mapped_frame(void *opaque, uint8_t *data) > +{ > + NVDECFrame *unmap_data = (NVDECFrame*)data; > + NVDECDecoder *decoder = (NVDECDecoder*)unmap_data->decoder_ref->data; > + CUdeviceptr devptr = (CUdeviceptr)opaque; > + CUresult err; > + CUcontext dummy; > + > + err = decoder->cudl->cuCtxPushCurrent(decoder->cuda_ctx); > + if (err != CUDA_SUCCESS) { > + av_log(NULL, AV_LOG_ERROR, "cuCtxPushCurrent failed\n"); > + goto finish; > + } > + > + err = decoder->cvdl->cuvidUnmapVideoFrame(decoder->decoder, devptr); > + if (err != CUDA_SUCCESS) > + av_log(NULL, AV_LOG_ERROR, "cuvidUnmapVideoFrame failed\n"); > + > + decoder->cudl->cuCtxPopCurrent(&dummy); > + > +finish: > + av_buffer_unref(&unmap_data->idx_ref); > + av_buffer_unref(&unmap_data->decoder_ref); > + av_free(unmap_data); > +} > + > static int nvdec_retrieve_data(void *logctx, AVFrame *frame) > { > FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data; > @@ -361,6 +387,7 @@ static int nvdec_retrieve_data(void *logctx, AVFrame *frame) > NVDECDecoder *decoder = (NVDECDecoder*)cf->decoder_ref->data; > > CUVIDPROCPARAMS vpp = { .progressive_frame = 1 }; > + NVDECFrame *unmap_data = NULL; > > CUresult err; > CUcontext dummy; > @@ -383,32 +410,39 @@ static int nvdec_retrieve_data(void *logctx, AVFrame *frame) > goto finish; > } > > - for (i = 0; frame->data[i]; i++) { > - CUDA_MEMCPY2D cpy = { > - .srcMemoryType = CU_MEMORYTYPE_DEVICE, > - .dstMemoryType = CU_MEMORYTYPE_DEVICE, > - .srcDevice = devptr, > - .dstDevice = (CUdeviceptr)frame->data[i], > - .srcPitch = pitch, > - .dstPitch = frame->linesize[i], > - .srcY = offset, > - .WidthInBytes = FFMIN(pitch, frame->linesize[i]), > - .Height = frame->height >> (i ? 1 : 0), > - }; > - > - err = decoder->cudl->cuMemcpy2D(&cpy); > - if (err != CUDA_SUCCESS) { > - av_log(logctx, AV_LOG_ERROR, "Error copying decoded frame: %d\n", > - err); > - ret = AVERROR_UNKNOWN; > - goto copy_fail; > - } > + unmap_data = av_mallocz(sizeof(*unmap_data)); > + if (!unmap_data) { > + ret = AVERROR(ENOMEM); > + goto copy_fail; > + } > > - offset += cpy.Height; > + frame->buf[1] = av_buffer_create((uint8_t *)unmap_data, sizeof(*unmap_data), > + nvdec_unmap_mapped_frame, (void*)devptr, > + AV_BUFFER_FLAG_READONLY); > + if (!frame->buf[1]) { > + ret = AVERROR(ENOMEM); > + goto copy_fail; > } > > + unmap_data->idx = cf->idx; > + unmap_data->idx_ref = av_buffer_ref(cf->idx_ref); > + unmap_data->decoder_ref = av_buffer_ref(cf->decoder_ref); > + > + for (i = 0; frame->linesize[i]; i++) { > + frame->data[i] = (uint8_t*)(devptr + offset); > + frame->linesize[i] = pitch; > + offset += pitch * (frame->height >> (i ? 1 : 0)); > + } > + > + goto finish; > + > copy_fail: > - decoder->cvdl->cuvidUnmapVideoFrame(decoder->decoder, devptr); > + if (!frame->buf[1]) { > + decoder->cvdl->cuvidUnmapVideoFrame(decoder->decoder, devptr); > + av_freep(&unmap_data); > + } else { > + av_buffer_unref(&frame->buf[1]); > + } > > finish: > decoder->cudl->cuCtxPopCurrent(&dummy); > @@ -526,6 +560,7 @@ int ff_nvdec_frame_params(AVCodecContext *avctx, > int dpb_size) > { > AVHWFramesContext *frames_ctx = (AVHWFramesContext*)hw_frames_ctx->data; > + AVCUDAFramesContext *hwctx = (AVCUDAFramesContext*)frames_ctx->hwctx; > const AVPixFmtDescriptor *sw_desc; > int cuvid_codec_type, cuvid_chroma_format; > > @@ -550,6 +585,8 @@ int ff_nvdec_frame_params(AVCodecContext *avctx, > frames_ctx->height = (avctx->coded_height + 1) & ~1; > frames_ctx->initial_pool_size = dpb_size; > > + hwctx->flags = AV_CUDA_HWFRAMES_DUMMY_MODE; > + > switch (sw_desc->comp[0].depth) { > case 8: > frames_ctx->sw_format = AV_PIX_FMT_NV12; Probably breaks if the API user doesn't use this dummy flag.
Am 08.05.2018 um 17:25 schrieb wm4: > On Tue, 8 May 2018 15:31:28 +0200 > Timo Rothenpieler <timo@rothenpieler.org> wrote: > >> Replaces the data pointers with the mapped cuvid ones. >> Adds buffer_refs to the frame to ensure the needed contexts stay alive >> and the cuvid idx stays allocated. >> Adds another buffer_ref to unmap the frame when it's unreferenced itself. >> --- >> libavcodec/nvdec.c | 83 +++++++++++++++++++++++++++++++++------------- >> 1 file changed, 60 insertions(+), 23 deletions(-) >> >> diff --git a/libavcodec/nvdec.c b/libavcodec/nvdec.c >> index ab3cb88b27..d98f9dd95e 100644 >> --- a/libavcodec/nvdec.c >> +++ b/libavcodec/nvdec.c >> @@ -308,7 +308,7 @@ int ff_nvdec_decode_init(AVCodecContext *avctx) >> params.CodecType = cuvid_codec_type; >> params.ChromaFormat = cuvid_chroma_format; >> params.ulNumDecodeSurfaces = frames_ctx->initial_pool_size; >> - params.ulNumOutputSurfaces = 1; >> + params.ulNumOutputSurfaces = frames_ctx->initial_pool_size; >> >> ret = nvdec_decoder_create(&ctx->decoder_ref, frames_ctx->device_ref, ¶ms, avctx); >> if (ret < 0) { >> @@ -354,6 +354,32 @@ static void nvdec_fdd_priv_free(void *priv) >> av_freep(&priv); >> } >> >> +static void nvdec_unmap_mapped_frame(void *opaque, uint8_t *data) >> +{ >> + NVDECFrame *unmap_data = (NVDECFrame*)data; >> + NVDECDecoder *decoder = (NVDECDecoder*)unmap_data->decoder_ref->data; >> + CUdeviceptr devptr = (CUdeviceptr)opaque; >> + CUresult err; >> + CUcontext dummy; >> + >> + err = decoder->cudl->cuCtxPushCurrent(decoder->cuda_ctx); >> + if (err != CUDA_SUCCESS) { >> + av_log(NULL, AV_LOG_ERROR, "cuCtxPushCurrent failed\n"); >> + goto finish; >> + } >> + >> + err = decoder->cvdl->cuvidUnmapVideoFrame(decoder->decoder, devptr); >> + if (err != CUDA_SUCCESS) >> + av_log(NULL, AV_LOG_ERROR, "cuvidUnmapVideoFrame failed\n"); >> + >> + decoder->cudl->cuCtxPopCurrent(&dummy); >> + >> +finish: >> + av_buffer_unref(&unmap_data->idx_ref); >> + av_buffer_unref(&unmap_data->decoder_ref); >> + av_free(unmap_data); >> +} >> + >> static int nvdec_retrieve_data(void *logctx, AVFrame *frame) >> { >> FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data; >> @@ -361,6 +387,7 @@ static int nvdec_retrieve_data(void *logctx, AVFrame *frame) >> NVDECDecoder *decoder = (NVDECDecoder*)cf->decoder_ref->data; >> >> CUVIDPROCPARAMS vpp = { .progressive_frame = 1 }; >> + NVDECFrame *unmap_data = NULL; >> >> CUresult err; >> CUcontext dummy; >> @@ -383,32 +410,39 @@ static int nvdec_retrieve_data(void *logctx, AVFrame *frame) >> goto finish; >> } >> >> - for (i = 0; frame->data[i]; i++) { >> - CUDA_MEMCPY2D cpy = { >> - .srcMemoryType = CU_MEMORYTYPE_DEVICE, >> - .dstMemoryType = CU_MEMORYTYPE_DEVICE, >> - .srcDevice = devptr, >> - .dstDevice = (CUdeviceptr)frame->data[i], >> - .srcPitch = pitch, >> - .dstPitch = frame->linesize[i], >> - .srcY = offset, >> - .WidthInBytes = FFMIN(pitch, frame->linesize[i]), >> - .Height = frame->height >> (i ? 1 : 0), >> - }; >> - >> - err = decoder->cudl->cuMemcpy2D(&cpy); >> - if (err != CUDA_SUCCESS) { >> - av_log(logctx, AV_LOG_ERROR, "Error copying decoded frame: %d\n", >> - err); >> - ret = AVERROR_UNKNOWN; >> - goto copy_fail; >> - } >> + unmap_data = av_mallocz(sizeof(*unmap_data)); >> + if (!unmap_data) { >> + ret = AVERROR(ENOMEM); >> + goto copy_fail; >> + } >> >> - offset += cpy.Height; >> + frame->buf[1] = av_buffer_create((uint8_t *)unmap_data, sizeof(*unmap_data), >> + nvdec_unmap_mapped_frame, (void*)devptr, >> + AV_BUFFER_FLAG_READONLY); >> + if (!frame->buf[1]) { >> + ret = AVERROR(ENOMEM); >> + goto copy_fail; >> } >> >> + unmap_data->idx = cf->idx; >> + unmap_data->idx_ref = av_buffer_ref(cf->idx_ref); >> + unmap_data->decoder_ref = av_buffer_ref(cf->decoder_ref); >> + >> + for (i = 0; frame->linesize[i]; i++) { >> + frame->data[i] = (uint8_t*)(devptr + offset); >> + frame->linesize[i] = pitch; >> + offset += pitch * (frame->height >> (i ? 1 : 0)); >> + } >> + >> + goto finish; >> + >> copy_fail: >> - decoder->cvdl->cuvidUnmapVideoFrame(decoder->decoder, devptr); >> + if (!frame->buf[1]) { >> + decoder->cvdl->cuvidUnmapVideoFrame(decoder->decoder, devptr); >> + av_freep(&unmap_data); >> + } else { >> + av_buffer_unref(&frame->buf[1]); >> + } >> >> finish: >> decoder->cudl->cuCtxPopCurrent(&dummy); >> @@ -526,6 +560,7 @@ int ff_nvdec_frame_params(AVCodecContext *avctx, >> int dpb_size) >> { >> AVHWFramesContext *frames_ctx = (AVHWFramesContext*)hw_frames_ctx->data; >> + AVCUDAFramesContext *hwctx = (AVCUDAFramesContext*)frames_ctx->hwctx; >> const AVPixFmtDescriptor *sw_desc; >> int cuvid_codec_type, cuvid_chroma_format; >> >> @@ -550,6 +585,8 @@ int ff_nvdec_frame_params(AVCodecContext *avctx, >> frames_ctx->height = (avctx->coded_height + 1) & ~1; >> frames_ctx->initial_pool_size = dpb_size; >> >> + hwctx->flags = AV_CUDA_HWFRAMES_DUMMY_MODE; >> + >> switch (sw_desc->comp[0].depth) { >> case 8: >> frames_ctx->sw_format = AV_PIX_FMT_NV12; > > Probably breaks if the API user doesn't use this dummy flag. It intentionally stores the other buffer_ref in buf[1] so it doesn't leak that memory if the context is supplied externally. It'll just be pointlessly wasting memory.
applied follow-up CUstream patches also applied
> > > > - offset += cpy.Height; > + frame->buf[1] = av_buffer_create((uint8_t *)unmap_data, > sizeof(*unmap_data), > + nvdec_unmap_mapped_frame, > (void*)devptr, > + AV_BUFFER_FLAG_READONLY); > + if (!frame->buf[1]) { > + ret = AVERROR(ENOMEM); > + goto copy_fail; > } > > If AVFrame.buf[i] is non-NULL, then buf[j] must be non-NULL for all j < i, see libavutil/frame.h. So either change the comment in frame.h or change your implementation is required
Am 10.05.2018 um 04:51 schrieb Wang Bin: >> >> >> >> - offset += cpy.Height; >> + frame->buf[1] = av_buffer_create((uint8_t *)unmap_data, >> sizeof(*unmap_data), >> + nvdec_unmap_mapped_frame, >> (void*)devptr, >> + AV_BUFFER_FLAG_READONLY); >> + if (!frame->buf[1]) { >> + ret = AVERROR(ENOMEM); >> + goto copy_fail; >> } >> >> > If AVFrame.buf[i] is non-NULL, then buf[j] must be non-NULL for all j < i, > see libavutil/frame.h. So either change the comment in frame.h or change > your implementation is required buf[0] is filled by the hw_frames_ctx, with a dummy buffer, but it is filled. So after this buf[0] and buf[1] are filled, so I don't see the problem?
Just want to update,
We compiled master branch with this patches, tested and looked at NSIGHT.
Everything correct:
- The internal NVDEC kernel is using the stream we have set in AVCUDADeviceContext struct.
- It slightly overlaps with other kernels, removing wait times between kernel executions.
- Nice that the Device to Device copies disappeared.
We are updating our ffmpeg compilation to this master branch commit.
It was a pleasure to collaborate.
Oscar
-----Original Message-----
From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Timo Rothenpieler
Sent: Thursday, May 10, 2018 12:52 AM
To: ffmpeg-devel@ffmpeg.org
Subject: Re: [FFmpeg-devel] [PATCH 2/6] avcodec/nvdec: avoid needless copy of output frame
applied
follow-up CUstream patches also applied
diff --git a/libavcodec/nvdec.c b/libavcodec/nvdec.c index ab3cb88b27..d98f9dd95e 100644 --- a/libavcodec/nvdec.c +++ b/libavcodec/nvdec.c @@ -308,7 +308,7 @@ int ff_nvdec_decode_init(AVCodecContext *avctx) params.CodecType = cuvid_codec_type; params.ChromaFormat = cuvid_chroma_format; params.ulNumDecodeSurfaces = frames_ctx->initial_pool_size; - params.ulNumOutputSurfaces = 1; + params.ulNumOutputSurfaces = frames_ctx->initial_pool_size; ret = nvdec_decoder_create(&ctx->decoder_ref, frames_ctx->device_ref, ¶ms, avctx); if (ret < 0) { @@ -354,6 +354,32 @@ static void nvdec_fdd_priv_free(void *priv) av_freep(&priv); } +static void nvdec_unmap_mapped_frame(void *opaque, uint8_t *data) +{ + NVDECFrame *unmap_data = (NVDECFrame*)data; + NVDECDecoder *decoder = (NVDECDecoder*)unmap_data->decoder_ref->data; + CUdeviceptr devptr = (CUdeviceptr)opaque; + CUresult err; + CUcontext dummy; + + err = decoder->cudl->cuCtxPushCurrent(decoder->cuda_ctx); + if (err != CUDA_SUCCESS) { + av_log(NULL, AV_LOG_ERROR, "cuCtxPushCurrent failed\n"); + goto finish; + } + + err = decoder->cvdl->cuvidUnmapVideoFrame(decoder->decoder, devptr); + if (err != CUDA_SUCCESS) + av_log(NULL, AV_LOG_ERROR, "cuvidUnmapVideoFrame failed\n"); + + decoder->cudl->cuCtxPopCurrent(&dummy); + +finish: + av_buffer_unref(&unmap_data->idx_ref); + av_buffer_unref(&unmap_data->decoder_ref); + av_free(unmap_data); +} + static int nvdec_retrieve_data(void *logctx, AVFrame *frame) { FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data; @@ -361,6 +387,7 @@ static int nvdec_retrieve_data(void *logctx, AVFrame *frame) NVDECDecoder *decoder = (NVDECDecoder*)cf->decoder_ref->data; CUVIDPROCPARAMS vpp = { .progressive_frame = 1 }; + NVDECFrame *unmap_data = NULL; CUresult err; CUcontext dummy; @@ -383,32 +410,39 @@ static int nvdec_retrieve_data(void *logctx, AVFrame *frame) goto finish; } - for (i = 0; frame->data[i]; i++) { - CUDA_MEMCPY2D cpy = { - .srcMemoryType = CU_MEMORYTYPE_DEVICE, - .dstMemoryType = CU_MEMORYTYPE_DEVICE, - .srcDevice = devptr, - .dstDevice = (CUdeviceptr)frame->data[i], - .srcPitch = pitch, - .dstPitch = frame->linesize[i], - .srcY = offset, - .WidthInBytes = FFMIN(pitch, frame->linesize[i]), - .Height = frame->height >> (i ? 1 : 0), - }; - - err = decoder->cudl->cuMemcpy2D(&cpy); - if (err != CUDA_SUCCESS) { - av_log(logctx, AV_LOG_ERROR, "Error copying decoded frame: %d\n", - err); - ret = AVERROR_UNKNOWN; - goto copy_fail; - } + unmap_data = av_mallocz(sizeof(*unmap_data)); + if (!unmap_data) { + ret = AVERROR(ENOMEM); + goto copy_fail; + } - offset += cpy.Height; + frame->buf[1] = av_buffer_create((uint8_t *)unmap_data, sizeof(*unmap_data), + nvdec_unmap_mapped_frame, (void*)devptr, + AV_BUFFER_FLAG_READONLY); + if (!frame->buf[1]) { + ret = AVERROR(ENOMEM); + goto copy_fail; } + unmap_data->idx = cf->idx; + unmap_data->idx_ref = av_buffer_ref(cf->idx_ref); + unmap_data->decoder_ref = av_buffer_ref(cf->decoder_ref); + + for (i = 0; frame->linesize[i]; i++) { + frame->data[i] = (uint8_t*)(devptr + offset); + frame->linesize[i] = pitch; + offset += pitch * (frame->height >> (i ? 1 : 0)); + } + + goto finish; + copy_fail: - decoder->cvdl->cuvidUnmapVideoFrame(decoder->decoder, devptr); + if (!frame->buf[1]) { + decoder->cvdl->cuvidUnmapVideoFrame(decoder->decoder, devptr); + av_freep(&unmap_data); + } else { + av_buffer_unref(&frame->buf[1]); + } finish: decoder->cudl->cuCtxPopCurrent(&dummy); @@ -526,6 +560,7 @@ int ff_nvdec_frame_params(AVCodecContext *avctx, int dpb_size) { AVHWFramesContext *frames_ctx = (AVHWFramesContext*)hw_frames_ctx->data; + AVCUDAFramesContext *hwctx = (AVCUDAFramesContext*)frames_ctx->hwctx; const AVPixFmtDescriptor *sw_desc; int cuvid_codec_type, cuvid_chroma_format; @@ -550,6 +585,8 @@ int ff_nvdec_frame_params(AVCodecContext *avctx, frames_ctx->height = (avctx->coded_height + 1) & ~1; frames_ctx->initial_pool_size = dpb_size; + hwctx->flags = AV_CUDA_HWFRAMES_DUMMY_MODE; + switch (sw_desc->comp[0].depth) { case 8: frames_ctx->sw_format = AV_PIX_FMT_NV12;