From 9e828c7cd943b964ccf4cc8d1059fcef014b24a3 Mon Sep 17 00:00:00 2001
From: Ganapathy Kasi <gkasi@nvidia.com>
Date: Mon, 12 Jun 2017 13:14:36 -0700
Subject: [PATCH] Share cuda context across multiple transcode sessions for the
same gpu
A CUDA context is allocated per decode/scale/encode session. If there are
multiple transcodes in the same process, many CUDA contexts are allocated for
the same underlying GPU device, which adds initialization performance overhead.
Sharing one CUDA context per device fixes the issue. Also, nvenc used to create
its CUDA context directly through the CUDA interface; it now uses the
av_hwdevice interface (av_hwdevice_ctx_create) instead.
---
libavcodec/nvenc.c | 33 ++++++++++++++++++---------------
libavcodec/nvenc.h | 3 ++-
libavutil/hwcontext_cuda.c | 40 ++++++++++++++++++++++++++--------------
3 files changed, 46 insertions(+), 30 deletions(-)
@@ -326,10 +326,14 @@ static av_cold int nvenc_check_device(AVCodecContext *avctx, int idx)
NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
char name[128] = { 0};
+ char device_str[20];
int major, minor, ret;
CUresult cu_res;
CUdevice cu_device;
CUcontext dummy;
+ AVHWDeviceContext *device_ctx;
+ AVCUDADeviceContext *device_hwctx;
+
int loglevel = AV_LOG_VERBOSE;
if (ctx->device == LIST_DEVICES)
@@ -364,19 +368,19 @@ static av_cold int nvenc_check_device(AVCodecContext *avctx, int idx)
if (ctx->device != idx && ctx->device != ANY_DEVICE)
return -1;
- cu_res = dl_fn->cuda_dl->cuCtxCreate(&ctx->cu_context_internal, 0, cu_device);
- if (cu_res != CUDA_SUCCESS) {
- av_log(avctx, AV_LOG_FATAL, "Failed creating CUDA context for NVENC: 0x%x\n", (int)cu_res);
+ if (ctx->device == ANY_DEVICE)
+ ctx->device = 0;
+
+ sprintf(device_str, "%d", ctx->device);
+
+ ret = av_hwdevice_ctx_create(&ctx->hwdevice, AV_HWDEVICE_TYPE_CUDA, device_str, NULL, 0);
+ if (ret < 0)
goto fail;
- }
- ctx->cu_context = ctx->cu_context_internal;
+ device_ctx = (AVHWDeviceContext *)ctx->hwdevice->data;
+ device_hwctx = device_ctx->hwctx;
- cu_res = dl_fn->cuda_dl->cuCtxPopCurrent(&dummy);
- if (cu_res != CUDA_SUCCESS) {
- av_log(avctx, AV_LOG_FATAL, "Failed popping CUDA context: 0x%x\n", (int)cu_res);
- goto fail2;
- }
+ ctx->cu_context = device_hwctx->cuda_ctx;
if ((ret = nvenc_open_session(avctx)) < 0)
goto fail2;
@@ -408,8 +412,8 @@ fail3:
}
fail2:
- dl_fn->cuda_dl->cuCtxDestroy(ctx->cu_context_internal);
- ctx->cu_context_internal = NULL;
+ av_buffer_unref(&ctx->hwdevice);
+ ctx->cu_context = NULL;
fail:
return AVERROR(ENOSYS);
@@ -1374,9 +1378,8 @@ av_cold int ff_nvenc_encode_close(AVCodecContext *avctx)
return AVERROR_EXTERNAL;
}
- if (ctx->cu_context_internal)
- dl_fn->cuda_dl->cuCtxDestroy(ctx->cu_context_internal);
- ctx->cu_context = ctx->cu_context_internal = NULL;
+ av_buffer_unref(&ctx->hwdevice);
+ ctx->cu_context = NULL;
nvenc_free_functions(&dl_fn->nvenc_dl);
cuda_free_functions(&dl_fn->cuda_dl);
@@ -106,7 +106,6 @@ typedef struct NvencContext
NV_ENC_INITIALIZE_PARAMS init_encode_params;
NV_ENC_CONFIG encode_config;
CUcontext cu_context;
- CUcontext cu_context_internal;
int nb_surfaces;
NvencSurface *surfaces;
@@ -116,6 +115,8 @@ typedef struct NvencContext
AVFifoBuffer *output_surface_ready_queue;
AVFifoBuffer *timestamp_list;
+ AVBufferRef *hwdevice;
+
struct {
CUdeviceptr ptr;
NV_ENC_REGISTERED_PTR regptr;
@@ -24,8 +24,12 @@
#include "mem.h"
#include "pixdesc.h"
#include "pixfmt.h"
+#include <time.h>
#define CUDA_FRAME_ALIGNMENT 256
+#define NUM_DEVICES 8
+
+CUcontext cudaCtx[NUM_DEVICES] = { NULL };
typedef struct CUDAFramesContext {
int shift_width, shift_height;
@@ -363,27 +367,35 @@ static int cuda_device_create(AVHWDeviceContext *ctx, const char *device,
cu = hwctx->internal->cuda_dl;
err = cu->cuInit(0);
- if (err != CUDA_SUCCESS) {
- av_log(ctx, AV_LOG_ERROR, "Could not initialize the CUDA driver API\n");
- goto error;
- }
- err = cu->cuDeviceGet(&cu_device, device_idx);
if (err != CUDA_SUCCESS) {
- av_log(ctx, AV_LOG_ERROR, "Could not get the device number %d\n", device_idx);
- goto error;
- }
-
- err = cu->cuCtxCreate(&hwctx->cuda_ctx, CU_CTX_SCHED_BLOCKING_SYNC, cu_device);
- if (err != CUDA_SUCCESS) {
- av_log(ctx, AV_LOG_ERROR, "Error creating a CUDA context\n");
+ av_log(ctx, AV_LOG_ERROR, "Could not initialize the CUDA driver API\n");
goto error;
}
- cu->cuCtxPopCurrent(&dummy);
+ if (!cudaCtx[device_idx])
+ {
+ err = cu->cuDeviceGet(&cu_device, device_idx);
+ if (err != CUDA_SUCCESS) {
+ av_log(ctx, AV_LOG_ERROR, "Could not get the device number %d\n", device_idx);
+ goto error;
+ }
- hwctx->internal->is_allocated = 1;
+ err = cu->cuCtxCreate(&hwctx->cuda_ctx, 0, cu_device);
+ if (err != CUDA_SUCCESS) {
+ av_log(ctx, AV_LOG_ERROR, "Error creating a CUDA context\n");
+ goto error;
+ }
+ cu->cuCtxPopCurrent(&dummy);
+ cudaCtx[device_idx] = hwctx->cuda_ctx;
+ hwctx->internal->is_allocated = 1;
+ }
+ else
+ {
+ hwctx->cuda_ctx = cudaCtx[device_idx];
+ hwctx->internal->is_allocated = 0;
+ }
return 0;
error:
--
2.7.4