[FFmpeg-devel,2/2] avfilter/dnn_processing: Add TensorRT backend

Message ID 20210725115843.8235-2-xiaoweiw@nvidia.com
State New
Series [FFmpeg-devel,1/2] avfilter/dnn/dnn_backend_trt: Update with master and sign-off

Checks

Context            Check    Description
andriy/x86_make    fail     Make failed
andriy/PPC64_make  warning  Make failed

Commit Message

Xiaowei Wang July 25, 2021, 11:58 a.m. UTC
The backend can be called as:
-vf dnn_processing=dnn_backend=tensorrt:model="model":input=:output=

As TensorRT provides a C++ API rather than a C API, the TensorRT implementation is
separated into a wrapper.
The wrapper is placed in https://github.com/DutchPiPi/nv-tensorrt-wrapper
Please build & install the wrapper before compiling ffmpeg.
Please see https://github.com/DutchPiPi/FFmpeg-trt-backend-test for how to
configure ffmpeg and generate a TensorRT engine for tests.

Signed-off-by: Xiaowei Wang <xiaoweiw@nvidia.com>
---
 libavfilter/dnn/Makefile               |   2 +-
 libavfilter/dnn/dnn_backend_tensorrt.c |  97 +++-
 libavfilter/dnn/dnn_backend_tensorrt.h |  40 +-
 libavfilter/dnn/dnn_io_proc_trt.cu     |  55 --
 libavfilter/dnn/trt_class_wrapper.cpp  | 731 -------------------------
 libavfilter/dnn/trt_class_wrapper.h    |  49 --
 6 files changed, 109 insertions(+), 865 deletions(-)
 delete mode 100644 libavfilter/dnn/dnn_io_proc_trt.cu
 delete mode 100644 libavfilter/dnn/trt_class_wrapper.cpp
 delete mode 100644 libavfilter/dnn/trt_class_wrapper.h

Comments

Jean-Baptiste Kempf July 25, 2021, 12:57 p.m. UTC | #1
On Sun, 25 Jul 2021, at 13:58, Xiaowei Wang wrote:
>  libavfilter/dnn/trt_class_wrapper.cpp  | 731 -------------------------

So, you add files in the first patch, and then delete it on the second one???
James Almer July 25, 2021, 1:04 p.m. UTC | #2
On 7/25/2021 8:58 AM, Xiaowei Wang wrote:
> The backend can be called as:
> -vf dnn_processing=dnn_backend=tensorrt:model="model":input=:output=
> 
> As TensorRT provides C++ API rather than C, the TensorRT implementation is
> separated into a wrapper.
> The wrapper is placed in https://github.com/DutchPiPi/nv-tensorrt-wrapper
> Please build & install the wrapper before compiling ffmpeg.
> Please see https://github.com/DutchPiPi/FFmpeg-trt-backend-test for how to
> configure ffmpeg and generate a TensorRT engine for tests.
> 
> Signed-off-by: Xiaowei Wang <xiaoweiw@nvidia.com>
> ---
>   libavfilter/dnn/Makefile               |   2 +-
>   libavfilter/dnn/dnn_backend_tensorrt.c |  97 +++-
>   libavfilter/dnn/dnn_backend_tensorrt.h |  40 +-
>   libavfilter/dnn/dnn_io_proc_trt.cu     |  55 --
>   libavfilter/dnn/trt_class_wrapper.cpp  | 731 -------------------------
>   libavfilter/dnn/trt_class_wrapper.h    |  49 --
>   6 files changed, 109 insertions(+), 865 deletions(-)
>   delete mode 100644 libavfilter/dnn/dnn_io_proc_trt.cu
>   delete mode 100644 libavfilter/dnn/trt_class_wrapper.cpp
>   delete mode 100644 libavfilter/dnn/trt_class_wrapper.h
> 
> diff --git a/libavfilter/dnn/Makefile b/libavfilter/dnn/Makefile
> index f9ea7ca386..4661d3b2cb 100644
> --- a/libavfilter/dnn/Makefile
> +++ b/libavfilter/dnn/Makefile
> @@ -16,6 +16,6 @@ OBJS-$(CONFIG_DNN)                           += dnn/dnn_backend_native_layer_mat
>   
>   DNN-OBJS-$(CONFIG_LIBTENSORFLOW)             += dnn/dnn_backend_tf.o
>   DNN-OBJS-$(CONFIG_LIBOPENVINO)               += dnn/dnn_backend_openvino.o
> -DNN-OBJS-$(CONFIG_LIBTENSORRT)               += dnn/dnn_backend_tensorrt.o dnn/trt_class_wrapper.o dnn/dnn_io_proc_trt.ptx.o
> +DNN-OBJS-$(CONFIG_LIBTENSORRT)               += dnn/dnn_backend_tensorrt.o
>   
>   OBJS-$(CONFIG_DNN)                           += $(DNN-OBJS-yes)
> diff --git a/libavfilter/dnn/dnn_backend_tensorrt.c b/libavfilter/dnn/dnn_backend_tensorrt.c
> index b45b770a77..e50ebc6c99 100644
> --- a/libavfilter/dnn/dnn_backend_tensorrt.c
> +++ b/libavfilter/dnn/dnn_backend_tensorrt.c
> @@ -25,45 +25,119 @@
>    * DNN TensorRT backend implementation.
>    */
>   
> -#include "trt_class_wrapper.h"
>   #include "dnn_backend_tensorrt.h"
>   
> -#include "libavutil/mem.h"
>   #include "libavformat/avio.h"
> +#include "libavutil/mem.h"
>   #include "libavutil/avassert.h"
>   #include "libavutil/opt.h"
>   #include "libavutil/avstring.h"
> +#include "libavutil/buffer.h"
> +#include "libavutil/pixfmt.h"
> +#include "libavutil/pixdesc.h"
> +
>   #include "dnn_io_proc.h"
>   #include "../internal.h"
> -#include "libavutil/buffer.h"
> +#include "trt_class_wrapper.h"
> +
> +#include <stdio.h>
> +#include <dlfcn.h>
> +#include <libavutil/log.h>
>   #include <stdint.h>
>   
>   #define OFFSET(x) offsetof(TRTContext, x)
>   #define FLAGS AV_OPT_FLAG_FILTERING_PARAM
>   static const AVOption dnn_tensorrt_options[] = {
> -    { "device", "index of the GPU to run model", OFFSET(options.device), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, FLAGS },
> +    { "device", "index of the GPU to run model", OFFSET(options.device),    AV_OPT_TYPE_INT,    { .i64 = 0 }, 0, INT_MAX, FLAGS },
> +    { "plugin", "path to the plugin so",         OFFSET(options.plugin_so), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0,     FLAGS },
>       { NULL }
>   };
>   AVFILTER_DEFINE_CLASS(dnn_tensorrt);
>   
> -DNNModel *ff_dnn_load_model_trt(const char *model_filename,DNNFunctionType func_type,
> +static TRTWrapper *wrapper = NULL;
> +
> +static int load_trt_backend_lib(TRTWrapper *w, const char *so_path, int mode)
> +{
> +    w->so_handle = dlopen("libnvtensorrt.so", mode);

No, dlopen() is not allowed for this kind of thing. Linking must be 
added at build time.

You for that matter apparently add support for build time linking in 
patch 1, then attempt to remove it in this one, leaving cruft in the 
configure script. Why?

> +    if (!w->so_handle)
> +    {
> +        return AVERROR(EIO);
> +    }
> +
> +    w->load_model_func = (tloadModelTrt*)dlsym(w->so_handle, "load_model_trt");
> +    w->execute_model_func = (texecuteModelTrt*)dlsym(w->so_handle, "execute_model_trt");
> +    w->free_model_func = (tfreeModelTrt*)dlsym(w->so_handle, "free_model_trt");
> +    if (!w->load_model_func || !w->execute_model_func || !w->free_model_func)
> +    {
> +        return AVERROR(EIO);
> +    }
> +
> +    return 0;
> +}
Xiaowei Wang July 25, 2021, 3:28 p.m. UTC | #3
The original idea was to submit the C++ code directly into FFmpeg. However, after the patch was ready, I was told that this was not a good idea, so I wrapped the TensorRT C++ implementation behind C interfaces (https://github.com/DutchPiPi/nv-tensorrt-wrapper) and removed the C++ file.
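
For illustration, a minimal sketch of the kind of C shim the wrapper exposes (the symbol names match the dlsym() calls in the patch; the exact parameter types here are assumptions, not the actual libnvtensorrt API):

/* Hypothetical sketch of the C interface exported by libnvtensorrt.so.
 * Only the symbol names and their call sites appear in the patch; the
 * parameter types below are assumptions. */
#ifndef NV_TENSORRT_WRAPPER_H
#define NV_TENSORRT_WRAPPER_H

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

struct DNNModel;
struct TRTContext;

/* Deserialize a TensorRT engine file and attach it to the DNNModel. */
int load_model_trt(struct DNNModel *model, struct TRTContext *ctx,
                   const char *engine_path);

/* Run inference on one frame; 'packed' selects packed vs. planar RGB. */
int execute_model_trt(const struct DNNModel *model,
                      uint8_t *in_data[], int in_linesize[],
                      int in_width, int in_height,
                      uint8_t *out_data[], int out_linesize[],
                      int out_width, int out_height,
                      int packed, int device);

/* Release the engine, execution context and device buffers. */
int free_model_trt(struct DNNModel *model);

#ifdef __cplusplus
}
#endif

#endif /* NV_TENSORRT_WRAPPER_H */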

Thanks,
Xiaowei Wang

-----Original Message-----
From: Jean-Baptiste Kempf <jb@videolan.org>
Sent: July 25, 2021, 20:58
To: Xiaowei Wang <xiaoweiw@nvidia.com>; FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Subject: Re: [FFmpeg-devel] [PATCH 2/2] avfilter/dnn_processing: Add TensorRT backend

On Sun, 25 Jul 2021, at 13:58, Xiaowei Wang wrote:
>  libavfilter/dnn/trt_class_wrapper.cpp  | 731 -------------------------

So, you add files in the first patch, and then delete it on the second one???

--
Jean-Baptiste Kempf -  President
+33 672 704 734
Xiaowei Wang Aug. 17, 2021, 2:06 a.m. UTC | #4
> No, dlopen() is not allowed for this kind of thing. Linking must be added at build time.

> You for that matter apparently add support for build time linking in patch 1, then attempt to remove it in this one, leaving cruft in the configure script. Why?

Sorry for the late reply; Outlook automatically put the mail in the junk folder. As I replied earlier, TensorRT only provides a C++ API, which means the filter would be implemented in C++. However, I was told that submitting C++ code is not a good idea, so I came up with this dlopen() idea: it lets me wrap the C++ code behind C interfaces and only submit C code.

If dlopen() is not allowed and submitting C++ code is fine, I will reorganize the code and go back to what I did in patch 1. Is this OK?
Xiaowei Wang Aug. 20, 2021, 5:03 p.m. UTC | #5
On 2021/7/25 21:04, James Almer wrote:
> 
> On 7/25/2021 8:58 AM, Xiaowei Wang wrote:
>> The backend can be called as:
>> -vf dnn_processing=dnn_backend=tensorrt:model="model":input=:output=
>>
>> As TensorRT provides C++ API rather than C, the TensorRT 
>> implementation is
>> separated into a wrapper.
>> The wrapper is placed in https://github.com/DutchPiPi/nv-tensorrt-wrapper
>> Please build & install the wrapper before compiling ffmpeg.
>> Please see https://github.com/DutchPiPi/FFmpeg-trt-backend-test for
>> how to
>> configure ffmpeg and generate a TensorRT engine for tests.
>>
>> Signed-off-by: Xiaowei Wang <xiaoweiw@nvidia.com>
>> ---
>>   libavfilter/dnn/Makefile               |   2 +-
>>   libavfilter/dnn/dnn_backend_tensorrt.c |  97 +++-
>>   libavfilter/dnn/dnn_backend_tensorrt.h |  40 +-
>>   libavfilter/dnn/dnn_io_proc_trt.cu     |  55 --
>>   libavfilter/dnn/trt_class_wrapper.cpp  | 731 -------------------------
>>   libavfilter/dnn/trt_class_wrapper.h    |  49 --
>>   6 files changed, 109 insertions(+), 865 deletions(-)
>>   delete mode 100644 libavfilter/dnn/dnn_io_proc_trt.cu
>>   delete mode 100644 libavfilter/dnn/trt_class_wrapper.cpp
>>   delete mode 100644 libavfilter/dnn/trt_class_wrapper.h
>>
>> diff --git a/libavfilter/dnn/Makefile b/libavfilter/dnn/Makefile
>> index f9ea7ca386..4661d3b2cb 100644
>> --- a/libavfilter/dnn/Makefile
>> +++ b/libavfilter/dnn/Makefile
>> @@ -16,6 +16,6 @@ OBJS-$(CONFIG_DNN)                           += 
>> dnn/dnn_backend_native_layer_mat
>>
>>   DNN-OBJS-$(CONFIG_LIBTENSORFLOW)             += dnn/dnn_backend_tf.o
>>   DNN-OBJS-$(CONFIG_LIBOPENVINO)               += 
>> dnn/dnn_backend_openvino.o
>> -DNN-OBJS-$(CONFIG_LIBTENSORRT)               += 
>> dnn/dnn_backend_tensorrt.o dnn/trt_class_wrapper.o 
>> dnn/dnn_io_proc_trt.ptx.o
>> +DNN-OBJS-$(CONFIG_LIBTENSORRT)               += 
>> dnn/dnn_backend_tensorrt.o
>>
>>   OBJS-$(CONFIG_DNN)                           += $(DNN-OBJS-yes)
>> diff --git a/libavfilter/dnn/dnn_backend_tensorrt.c 
>> b/libavfilter/dnn/dnn_backend_tensorrt.c
>> index b45b770a77..e50ebc6c99 100644
>> --- a/libavfilter/dnn/dnn_backend_tensorrt.c
>> +++ b/libavfilter/dnn/dnn_backend_tensorrt.c
>> @@ -25,45 +25,119 @@
>>    * DNN TensorRT backend implementation.
>>    */
>>
>> -#include "trt_class_wrapper.h"
>>   #include "dnn_backend_tensorrt.h"
>>
>> -#include "libavutil/mem.h"
>>   #include "libavformat/avio.h"
>> +#include "libavutil/mem.h"
>>   #include "libavutil/avassert.h"
>>   #include "libavutil/opt.h"
>>   #include "libavutil/avstring.h"
>> +#include "libavutil/buffer.h"
>> +#include "libavutil/pixfmt.h"
>> +#include "libavutil/pixdesc.h"
>> +
>>   #include "dnn_io_proc.h"
>>   #include "../internal.h"
>> -#include "libavutil/buffer.h"
>> +#include "trt_class_wrapper.h"
>> +
>> +#include <stdio.h>
>> +#include <dlfcn.h>
>> +#include <libavutil/log.h>
>>   #include <stdint.h>
>>
>>   #define OFFSET(x) offsetof(TRTContext, x)
>>   #define FLAGS AV_OPT_FLAG_FILTERING_PARAM
>>   static const AVOption dnn_tensorrt_options[] = {
>> -    { "device", "index of the GPU to run model", 
>> OFFSET(options.device), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, 
>> FLAGS },
>> +    { "device", "index of the GPU to run model", 
>> OFFSET(options.device),    AV_OPT_TYPE_INT,    { .i64 = 0 }, 0, 
>> INT_MAX, FLAGS },
>> +    { "plugin", "path to the plugin so",         
>> OFFSET(options.plugin_so), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 
>> 0,     FLAGS },
>>       { NULL }
>>   };
>>   AVFILTER_DEFINE_CLASS(dnn_tensorrt);
>>
>> -DNNModel *ff_dnn_load_model_trt(const char 
>> *model_filename,DNNFunctionType func_type,
>> +static TRTWrapper *wrapper = NULL;
>> +
>> +static int load_trt_backend_lib(TRTWrapper *w, const char *so_path, 
>> int mode)
>> +{
>> +    w->so_handle = dlopen("libnvtensorrt.so", mode);
> 
> No, dlopen() is not allowed for this kind of thing. Linking must be
> added at build time.
> 
> You for that matter apparently add support for build time linking in
> patch 1, then attempt to remove it in this one, leaving cruft in the
> configure script. Why?
Not getting responses, so re-sending.

As TensorRT only provides C++ APIs, the implementation of the backend inevitably contains C++ code, as in patch 1. After patch 1 was finished, I heard that it would be better to avoid submitting C++ code, so I put the C++ code inside a C wrapper (libnvtensorrt.so). I found that FFmpeg uses dlopen() to call CUDA and the codec SDK, and I thought that dlopen() might be a preferable approach, so I used dlopen() as well.

If dlopen() is not allowed, I can keep the C++ code in the wrapper but link it at build time. I will also update the configure script and change the dependency to libnvtensorrt rather than libnvinfer. (libnvinfer is part of TensorRT and libnvtensorrt is the C wrapper of my C++ code.)
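
Roughly, with build-time linking the backend would call the wrapper directly instead of going through dlsym(); a minimal sketch, assuming an installed header from the wrapper (the header name, return-code handling and cleanup are placeholders, not the final code):

/* Hypothetical sketch: build-time linking against libnvtensorrt instead of
 * dlopen()/dlsym(). Assumes a header installed by the wrapper that declares
 * load_model_trt()/execute_model_trt()/free_model_trt() directly. */
#include "nv_tensorrt_wrapper.h"   /* placeholder header name */

DNNModel *ff_dnn_load_model_trt(const char *model_filename, DNNFunctionType func_type,
                                const char *options, AVFilterContext *filter_ctx)
{
    TRTContext *ctx = av_mallocz(sizeof(*ctx));
    DNNModel *model = av_mallocz(sizeof(*model));

    if (!ctx || !model)
        goto fail;

    ctx->av_class = &dnn_tensorrt_class;
    av_opt_set_defaults(ctx);
    if (av_opt_set_from_string(ctx, options, NULL, "=", "&") < 0) {
        av_log(ctx, AV_LOG_ERROR, "Failed to parse options \"%s\"\n", options);
        goto fail;
    }

    /* Direct call: the linker resolves the symbol, no dlsym() needed. */
    if (load_model_trt(model, ctx, model_filename) != 0)
        goto fail;

    model->options = options;
    return model;

fail:
    av_freep(&ctx);
    av_freep(&model);
    return NULL;
}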
> 
>> +    if (!w->so_handle)
>> +    {
>> +        return AVERROR(EIO);
>> +    }
>> +
>> +    w->load_model_func = (tloadModelTrt*)dlsym(w->so_handle, 
>> "load_model_trt");
>> +    w->execute_model_func = (texecuteModelTrt*)dlsym(w->so_handle, 
>> "execute_model_trt");
>> +    w->free_model_func = (tfreeModelTrt*)dlsym(w->so_handle, 
>> "free_model_trt");
>> +    if (!w->load_model_func || !w->execute_model_func || 
>> !w->free_model_func)
>> +    {
>> +        return AVERROR(EIO);
>> +    }
>> +
>> +    return 0;
>> +}
> 

Patch

diff --git a/libavfilter/dnn/Makefile b/libavfilter/dnn/Makefile
index f9ea7ca386..4661d3b2cb 100644
--- a/libavfilter/dnn/Makefile
+++ b/libavfilter/dnn/Makefile
@@ -16,6 +16,6 @@  OBJS-$(CONFIG_DNN)                           += dnn/dnn_backend_native_layer_mat
 
 DNN-OBJS-$(CONFIG_LIBTENSORFLOW)             += dnn/dnn_backend_tf.o
 DNN-OBJS-$(CONFIG_LIBOPENVINO)               += dnn/dnn_backend_openvino.o
-DNN-OBJS-$(CONFIG_LIBTENSORRT)               += dnn/dnn_backend_tensorrt.o dnn/trt_class_wrapper.o dnn/dnn_io_proc_trt.ptx.o
+DNN-OBJS-$(CONFIG_LIBTENSORRT)               += dnn/dnn_backend_tensorrt.o
 
 OBJS-$(CONFIG_DNN)                           += $(DNN-OBJS-yes)
diff --git a/libavfilter/dnn/dnn_backend_tensorrt.c b/libavfilter/dnn/dnn_backend_tensorrt.c
index b45b770a77..e50ebc6c99 100644
--- a/libavfilter/dnn/dnn_backend_tensorrt.c
+++ b/libavfilter/dnn/dnn_backend_tensorrt.c
@@ -25,45 +25,119 @@ 
  * DNN TensorRT backend implementation.
  */
 
-#include "trt_class_wrapper.h"
 #include "dnn_backend_tensorrt.h"
 
-#include "libavutil/mem.h"
 #include "libavformat/avio.h"
+#include "libavutil/mem.h"
 #include "libavutil/avassert.h"
 #include "libavutil/opt.h"
 #include "libavutil/avstring.h"
+#include "libavutil/buffer.h"
+#include "libavutil/pixfmt.h"
+#include "libavutil/pixdesc.h"
+
 #include "dnn_io_proc.h"
 #include "../internal.h"
-#include "libavutil/buffer.h"
+#include "trt_class_wrapper.h"
+
+#include <stdio.h>
+#include <dlfcn.h>
+#include <libavutil/log.h>
 #include <stdint.h>
 
 #define OFFSET(x) offsetof(TRTContext, x)
 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM
 static const AVOption dnn_tensorrt_options[] = {
-    { "device", "index of the GPU to run model", OFFSET(options.device), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, FLAGS },
+    { "device", "index of the GPU to run model", OFFSET(options.device),    AV_OPT_TYPE_INT,    { .i64 = 0 }, 0, INT_MAX, FLAGS },
+    { "plugin", "path to the plugin so",         OFFSET(options.plugin_so), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0,     FLAGS },
     { NULL }
 };
 AVFILTER_DEFINE_CLASS(dnn_tensorrt);
 
-DNNModel *ff_dnn_load_model_trt(const char *model_filename,DNNFunctionType func_type, 
+static TRTWrapper *wrapper = NULL;
+
+static int load_trt_backend_lib(TRTWrapper *w, const char *so_path, int mode)
+{
+    w->so_handle = dlopen("libnvtensorrt.so", mode);
+    if (!w->so_handle)
+    {
+        return AVERROR(EIO);
+    }
+
+    w->load_model_func = (tloadModelTrt*)dlsym(w->so_handle, "load_model_trt");
+    w->execute_model_func = (texecuteModelTrt*)dlsym(w->so_handle, "execute_model_trt");
+    w->free_model_func = (tfreeModelTrt*)dlsym(w->so_handle, "free_model_trt");
+    if (!w->load_model_func || !w->execute_model_func || !w->free_model_func)
+    {
+        return AVERROR(EIO);
+    }
+
+    return 0;
+}
+
+DNNModel *ff_dnn_load_model_trt(const char *model_filename,DNNFunctionType func_type,
                                 const char *options, AVFilterContext *filter_ctx)
 {
+    char id_buf[64];
+    AVBufferRef *device_ref = NULL;
+    TRTContext *ctx = (TRTContext*)av_mallocz(sizeof(TRTContext));
+
+    int ret = 0;
+
     DNNModel *model = NULL;
     model = (DNNModel*)av_mallocz(sizeof(DNNModel));
     if (!model){
         return NULL;
     }
+    wrapper = av_mallocz(sizeof(TRTWrapper));
+    wrapper->ctx = ctx;
+    if (load_trt_backend_lib(wrapper, "libnvtensorrt.so", RTLD_LAZY) != 0)
+    {
+        av_log(ctx, AV_LOG_ERROR, "Cannot load wrapper functions. Please check if libnvtensorrt.so is installed\n");
+        return NULL;
+    }
+    ctx->av_class = &dnn_tensorrt_class;
+    av_opt_set_defaults(ctx);
+    if (av_opt_set_from_string(ctx, options, NULL, "=", "&") < 0)
+    {
+        av_log(ctx, AV_LOG_ERROR, "Failed to parse options \"%s\"\n", options);
+        return NULL;
+    }
+    snprintf(id_buf, sizeof(id_buf), "%d", ctx->options.device);
+
+    if (ctx->options.plugin_so)
+    {
+        if (dlopen(ctx->options.plugin_so, RTLD_LAZY))
+        {
+            av_log(ctx, AV_LOG_INFO, "Loaded plugin library\n");
+        }
+        else
+        {
+            av_log(ctx, AV_LOG_ERROR, "Error loading plugin library\n");
+            return NULL;
+        }
+    }
 
-    trt_load_model(model, model_filename, &dnn_tensorrt_class, options);
+    av_log(ctx, AV_LOG_INFO, "Load trt engine\n");
+
+    ret = wrapper->load_model_func(model, ctx, model_filename);
+
+    ctx->hwdevice = device_ref;
+    model->options = options;
 
     return model;
 }
 
 DNNReturnType ff_dnn_execute_model_trt(const DNNModel *model, DNNExecBaseParams *exec_params)
 {
-    execute_model_trt(model, exec_params->input_name, exec_params->in_frame, 
-                    exec_params->output_names, exec_params->nb_output, exec_params->out_frame);
+    AVFrame *in_frame = exec_params->in_frame;
+    AVFrame *out_frame = exec_params->out_frame;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get((enum AVPixelFormat)in_frame->format);
+    int packed = (desc->flags & AV_PIX_FMT_FLAG_PLANAR) ? 0 : 1;
+
+    wrapper->execute_model_func(model, in_frame->data, in_frame->linesize, in_frame->width, in_frame->height,
+                                out_frame->data, out_frame->linesize, out_frame->width, out_frame->height,
+                                packed, 0);
     return DNN_SUCCESS;
 }
 
@@ -71,7 +145,10 @@  void ff_dnn_free_model_trt(DNNModel **model)
 {
     if (*model)
     {
-        free_model_trt(*model);
-        av_freep(model);
+        wrapper->free_model_func(*model);
+        dlclose(wrapper->so_handle);
+
+        av_freep(&wrapper->ctx);
+        av_freep(&wrapper);
     }
 }
\ No newline at end of file
diff --git a/libavfilter/dnn/dnn_backend_tensorrt.h b/libavfilter/dnn/dnn_backend_tensorrt.h
index d700cb247f..02e26ce032 100644
--- a/libavfilter/dnn/dnn_backend_tensorrt.h
+++ b/libavfilter/dnn/dnn_backend_tensorrt.h
@@ -37,28 +37,30 @@  extern "C"
     #include "libavutil/hwcontext.h"
     #include "libavutil/hwcontext_cuda_internal.h"
 
-    typedef struct TRTOptions{
-        int device;
-    } TRTOptions;
+    // typedef struct TRTOptions{
+    //     int device;
+    // } TRTOptions;
 
-    typedef struct TRTContext{
-        const AVClass *av_class;
-        TRTOptions options;
-        AVBufferRef *hwdevice;
-        // Host memory pointer to input/output image data
-        void *host_in, *host_out;
-        // Device memory pointer to the fp32 CHW input/output of the model
-        // The device memory is only allocated once and reused during inference
-        // Multiple input/output is not supported
-        CUdeviceptr trt_in, trt_out;
-        // Device memory pointer to 8-bit image data
-        CUdeviceptr frame_in, frame_out;
+    // typedef struct TRTContext{
+    //     const AVClass *av_class;
+    //     TRTOptions options;
+    //     AVBufferRef *hwdevice;
+    //     // Host memory pointer to input/output image data
+    //     void *host_in, *host_out;
+    //     // Device memory pointer to the fp32 CHW input/output of the model
+    //     // The device memory is only allocated once and reused during inference
+    //     // Multiple input/output is not supported
+    //     CUdeviceptr trt_in, trt_out;
+    //     // Device memory pointer to 8-bit image data
+    //     CUdeviceptr frame_in, frame_out;
 
-        CUmodule cu_module;
-        CUfunction cu_func_frame_to_dnn, cu_func_dnn_to_frame;
+    //     CUmodule cu_module;
+    //     CUfunction cu_func_frame_to_dnn, cu_func_dnn_to_frame;
 
-        int channels;
-    } TRTContext;
+    //     CUcontext cuda_ctx;
+
+    //     int channels, packed;
+    // } TRTContext;
     
     DNNModel *ff_dnn_load_model_trt(const char *model_filename,DNNFunctionType func_type, 
                                     const char *options, AVFilterContext *filter_ctx);
diff --git a/libavfilter/dnn/dnn_io_proc_trt.cu b/libavfilter/dnn/dnn_io_proc_trt.cu
deleted file mode 100644
index 030cfd2f60..0000000000
--- a/libavfilter/dnn/dnn_io_proc_trt.cu
+++ /dev/null
@@ -1,55 +0,0 @@ 
-#include <bits/stdint-uintn.h>
-extern "C" {
-
-__global__ void frame_to_dnn(uint8_t *src, int src_linesize, float *dst, int dst_linesize, 
-                             int width, int height, int unpack_rgb)
-{
-    int x = blockIdx.x * blockDim.x + threadIdx.x;
-    int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (x >= width || y >= height)
-        return;
-    
-    if (unpack_rgb)
-    {
-        uchar3 rgb = *((uchar3 *)(src + y * src_linesize) + x);
-        dst[y * dst_linesize + x] = (float)rgb.x;
-        dst[y * dst_linesize + x + dst_linesize * height] = (float)rgb.y;
-        dst[y * dst_linesize + x + 2 * dst_linesize * height] = (float)rgb.z;
-    }
-    else
-    {
-        dst[y * dst_linesize + x] = (float)src[y * src_linesize + x];
-    }
-}
-
-__device__ static float clamp(float x, float lower, float upper) {
-    return x < lower ? lower : (x > upper ? upper : x);
-}
-
-__global__ void dnn_to_frame(float *src, int src_linesize, uint8_t *dst, int dst_linesize, 
-                            int width, int height, int pack_rgb)
-{
-    int x = blockIdx.x * blockDim.x + threadIdx.x;
-    int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (x >= width || y >= height)
-        return;
-
-    if (pack_rgb)
-    {
-        uint8_t r = (uint8_t)clamp(src[y * src_linesize + x], .0f, 255.0f);
-        uint8_t g = (uint8_t)clamp(src[y * src_linesize + x + src_linesize * height], .0f, 255.0f);
-        uint8_t b = (uint8_t)clamp(src[y * src_linesize + x + 2 * src_linesize * height], .0f, 255.0f);
-
-        uchar3 rgb = make_uchar3(r, g, b);
-
-        *((uchar3*)(dst + y * dst_linesize) + x) = rgb;
-    }
-    else
-    {
-        dst[y * dst_linesize + x] = (uint8_t)clamp(src[y * src_linesize + x], .0f, 255.0f);
-    }
-}
-
-}
\ No newline at end of file
diff --git a/libavfilter/dnn/trt_class_wrapper.cpp b/libavfilter/dnn/trt_class_wrapper.cpp
deleted file mode 100644
index dac433b690..0000000000
--- a/libavfilter/dnn/trt_class_wrapper.cpp
+++ /dev/null
@@ -1,731 +0,0 @@ 
-/*
-* Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-* DEALINGS IN THE SOFTWARE.
- */
- 
-/**
- * @file
- * DNN TensorRT backend C++ wrapper.
- */
-
-#include "trt_class_wrapper.h"
-#include "dnn_backend_tensorrt.h"
-
-#include <vector>
-#include <map>
-#include <iostream>
-#include <fstream>
-#include <iomanip>
-#include <string>
-#include <chrono>
-#include <sstream>
-#include <mutex>
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-    
-    #include "libavutil/buffer.h"
-    #include "libavutil/hwcontext.h"
-    #include "libavutil/cuda_check.h"
-    #include "libavutil/log.h"
-    #include "libavutil/opt.h"
-    #include "libavformat/avio.h"
-    #include "dnn_io_proc.h"
-    #include "libavutil/frame.h"
-    #include "libavutil/pixdesc.h"
-    #include "libavutil/pixfmt.h"
-    #include "libavutil/mem.h"
-
-#ifdef __cplusplus
-}
-#endif
-
-#include <sys/stat.h>
-#include <time.h>
-#include <unistd.h>
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <arpa/inet.h>
-#define SOCKET int
-#define INVALID_SOCKET -1
-
-#include <cuda_runtime.h>
-#include <NvInfer.h>
-
-using namespace nvinfer1;
-using namespace std;
-
-#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
-#define BLOCKX 32
-#define BLOCKY 16
-
-// Self-defined CUDA check functions as cuda_check.h is not available for cpp due to void* function pointers
-inline bool check(CUresult e, TRTContext *ctx, CudaFunctions* cu, int iLine, const char *szFile) {
-    if (e != CUDA_SUCCESS) {
-        const char* pStr;
-        cu->cuGetErrorName(e, &pStr);
-        av_log(ctx, AV_LOG_ERROR, "CUDA driver API error: %s, at line %d in file %s\n",
-        pStr, iLine, szFile);
-        return false;
-    }
-    return true;
-}
-
-inline bool check(cudaError_t e, TRTContext *ctx, int iLine, const char *szFile) {
-    if (e != cudaSuccess) {
-        av_log(ctx, AV_LOG_ERROR, "CUDA runtime API error: %s, at line %d in file %s\n",
-            cudaGetErrorName(e), iLine, szFile);
-        return false;
-    }
-    return true;
-}
-
-inline bool check(bool bSuccess, TRTContext *ctx, int iLine, const char *szFile) {
-    if (!bSuccess) {
-        av_log(ctx, AV_LOG_ERROR, "Error at line %d in file %s\n", iLine, szFile);
-        return false;
-    }
-    return true;
-}
-
-#define ck(call, ctx) check(call, ctx, __LINE__, __FILE__)
-#define ck_cu(call) check(call, ctx, cu, __LINE__, __FILE__)
-
-inline std::string to_string(nvinfer1::Dims const &dim) {
-    std::ostringstream oss;
-    oss << "(";
-    for (int i = 0; i < dim.nbDims; i++) {
-        oss << dim.d[i] << ", ";
-    }
-    oss << ")";
-    return oss.str();
-}
-
-typedef ICudaEngine *(*BuildEngineProcType)(IBuilder *builder, void *pData);
-
-struct IOInfo {
-    string name;
-    bool bInput;
-    nvinfer1::Dims dim;
-    nvinfer1::DataType dataType;
-
-    string GetDimString() {
-        return ::to_string(dim);
-    }
-    string GetDataTypeString() {
-        static string aTypeName[] = {"float", "half", "int8", "int32", "bool"};
-        return aTypeName[(int)dataType];
-    }
-    size_t GetNumBytes() {
-        static int aSize[] = {4, 2, 1, 4, 1};
-        size_t nSize = aSize[(int)dataType];
-        for (int i = 0; i < dim.nbDims; i++) {
-            nSize *= dim.d[i];
-        }
-        return nSize;
-    }
-    string to_string() {
-        ostringstream oss;
-        oss << setw(6) << (bInput ? "input" : "output") 
-            << " | " << setw(5) << GetDataTypeString() 
-            << " | " << GetDimString() 
-            << " | " << "size=" << GetNumBytes()
-            << " | " << name;
-        return oss.str();
-    }
-};
-
-class TrtLogger : public nvinfer1::ILogger {
-public:
-    TrtLogger(TRTContext *ctx) : ctx(ctx) {}
-    void log(Severity severity, const char* msg) override {
-        int log_level = AV_LOG_INFO;
-        switch (severity){
-            case nvinfer1::ILogger::Severity::kERROR:
-            log_level = AV_LOG_ERROR;
-            break;
-            case nvinfer1::ILogger::Severity::kWARNING:
-            log_level = AV_LOG_WARNING;
-            break;
-            case nvinfer1::ILogger::Severity::kINFO:
-            log_level = AV_LOG_INFO;
-            break;
-            case nvinfer1::ILogger::Severity::kVERBOSE:
-            log_level = AV_LOG_DEBUG;
-            break;
-            case nvinfer1::ILogger::Severity::kINTERNAL_ERROR:
-            log_level = AV_LOG_FATAL;
-            break;
-        }
-        av_log(ctx, log_level, "%s\n", msg);
-    }
-private:
-    TRTContext *ctx = nullptr;
-};
-    
-class TrtLite {
-public:
-    TrtLite(const char *szEnginePath, TRTContext *trt_ctx) : ctx(trt_ctx) {
-        uint8_t *pBuf = nullptr;
-        uint32_t nSize = 0;
-
-        trt_logger = new TrtLogger(trt_ctx);
-        
-        read_engine(&pBuf, &nSize, szEnginePath);
-        IRuntime *runtime = createInferRuntime(*trt_logger);
-        engine = runtime->deserializeCudaEngine(pBuf, nSize);
-        runtime->destroy();
-        if (!engine) {
-            av_log(ctx, AV_LOG_ERROR, "No engine generated\n");
-            return;
-        }
-        av_freep(&pBuf);
-    }
-    virtual ~TrtLite() {
-        if (context) {
-            context->destroy();
-        }
-        if (engine) {
-            engine->destroy();
-        }
-    }
-    ICudaEngine *GetEngine() {
-        return engine;
-    }
-    void Execute(int nBatch, vector<void *> &vdpBuf, cudaStream_t stm = 0, cudaEvent_t* evtInputConsumed = nullptr) {
-        if (!engine) {
-            av_log(ctx, AV_LOG_ERROR, "No engine\n");
-            return;
-        }
-        if (!engine->hasImplicitBatchDimension() && nBatch > 1) {
-            av_log(ctx, AV_LOG_ERROR, 
-                "Engine was built with explicit batch but is executed with batch size != 1. Results may be incorrect.\n");
-            return;
-        }
-        if (engine->getNbBindings() != vdpBuf.size()) {
-            av_log(ctx, AV_LOG_ERROR, "Number of bindings conflicts with input and output\n");
-            return;
-        }
-        if (!context) {
-            context = engine->createExecutionContext();
-            if (!context) {
-                av_log(ctx, AV_LOG_ERROR, "createExecutionContext() failed\n");
-                return;
-            }
-        }
-        ck(context->enqueue(nBatch, vdpBuf.data(), stm, evtInputConsumed), ctx);
-    }
-    void Execute(map<int, Dims> i2shape, vector<void *> &vdpBuf, cudaStream_t stm = 0, cudaEvent_t* evtInputConsumed = nullptr) {
-        if (!engine) {
-            av_log(ctx, AV_LOG_ERROR, "No engine\n");
-            return;
-        }
-        if (engine->hasImplicitBatchDimension()) {
-            av_log(ctx, AV_LOG_ERROR, "Engine was built with static-shaped input\n");
-            return;
-        }
-        if (engine->getNbBindings() != vdpBuf.size()) {
-            av_log(ctx, AV_LOG_ERROR, "Number of bindings conflicts with input and output\n");
-            return;
-        }
-        if (!context) {
-            context = engine->createExecutionContext();
-            if (!context) {
-                av_log(ctx, AV_LOG_ERROR, "createExecutionContext() failed\n");
-                return;
-            }
-        }
-        for (auto &it : i2shape) {
-            context->setBindingDimensions(it.first, it.second);
-        }
-        ck(context->enqueueV2(vdpBuf.data(), stm, evtInputConsumed), ctx);
-    }
-
-    vector<IOInfo> ConfigIO(int nBatchSize) {
-        vector<IOInfo> vInfo;
-        if (!engine) {
-            av_log(ctx, AV_LOG_ERROR, "No engine\n");
-            return vInfo;
-        }
-        if (!engine->hasImplicitBatchDimension()) {
-            av_log(ctx, AV_LOG_ERROR, "Engine must be built with implicit batch size (and static shape)\n");
-            return vInfo;
-        }
-        for (int i = 0; i < engine->getNbBindings(); i++) {
-            vInfo.push_back({string(engine->getBindingName(i)), engine->bindingIsInput(i), 
-                MakeDim(nBatchSize, engine->getBindingDimensions(i)), engine->getBindingDataType(i)});
-        }
-        return vInfo;
-    }
-    vector<IOInfo> ConfigIO(map<int, Dims> i2shape) {
-        vector<IOInfo> vInfo;
-        if (!engine) {
-            av_log(ctx, AV_LOG_ERROR, "No engine\n");
-            return vInfo;
-        }
-        if (engine->hasImplicitBatchDimension()) {
-            av_log(ctx, AV_LOG_ERROR, "Engine must be built with explicit batch size (to enable dynamic shape)\n");
-            return vInfo;
-        }
-        if (!context) {
-            context = engine->createExecutionContext();
-            if (!context) {
-                av_log(ctx, AV_LOG_ERROR, "createExecutionContext() failed\n");
-                return vInfo;
-            }
-        }
-        for (auto &it : i2shape) {
-            context->setBindingDimensions(it.first, it.second);
-        }
-        if (!context->allInputDimensionsSpecified()) {
-            av_log(ctx, AV_LOG_ERROR, "Not all binding shape are specified\n");
-            return vInfo;
-        }
-        for (int i = 0; i < engine->getNbBindings(); i++) {
-            vInfo.push_back({string(engine->getBindingName(i)), engine->bindingIsInput(i), 
-                context->getBindingDimensions(i), engine->getBindingDataType(i)});
-        }
-        return vInfo;
-    }
-
-    void PrintInfo() {
-        if (!engine) {
-            av_log(ctx, AV_LOG_ERROR, "No engine\n");
-            return;
-        }
-        av_log(ctx, AV_LOG_INFO, "nbBindings: %d\n", engine->getNbBindings());
-        // Only contains engine-level IO information: if dynamic shape is used,
-        // dimension -1 will be printed
-        for (int i = 0; i < engine->getNbBindings(); i++) {
-            av_log(ctx, AV_LOG_INFO, "#%d: %s\n", i, IOInfo{string(engine->getBindingName(i)), engine->bindingIsInput(i),
-                engine->getBindingDimensions(i), engine->getBindingDataType(i)}.to_string().c_str());
-        }
-    }
-
-    TRTContext *ctx = nullptr;
-    
-private:
-    void read_engine(uint8_t **engine_buf, uint32_t *engine_size, const char *engine_filename) {
-        AVIOContext *engine_file_ctx;
-        *engine_buf = nullptr;
-
-        if (avio_open(&engine_file_ctx, engine_filename, AVIO_FLAG_READ) < 0){
-            av_log(ctx, AV_LOG_ERROR, "Error reading engine file from disk!\n");
-            return;
-        }
-
-        uint32_t size = avio_size(engine_file_ctx);
-        uint8_t *buffer = (uint8_t*)av_malloc(size);
-        if (!buffer){
-            avio_closep(&engine_file_ctx);
-            av_log(ctx, AV_LOG_ERROR, "Error allocating memory for TRT engine.\n");
-            return;
-        }
-        uint32_t bytes_read = avio_read(engine_file_ctx, buffer, size);
-        avio_closep(&engine_file_ctx);
-        if (bytes_read != size){
-            av_freep(&buffer);
-            av_log(ctx, AV_LOG_ERROR, "Engine file size (%d) does not equal to read size (%d)\n", size, bytes_read);
-            return;
-        }
-
-        *engine_buf = buffer;
-        *engine_size = size;
-
-        return;
-    }
-    static size_t GetBytesOfBinding(int iBinding, ICudaEngine *engine, IExecutionContext *context = nullptr) {
-        size_t aValueSize[] = {4, 2, 1, 4, 1};
-        size_t nSize = aValueSize[(int)engine->getBindingDataType(iBinding)];
-        const Dims &dims = context ? context->getBindingDimensions(iBinding) : engine->getBindingDimensions(iBinding);
-        for (int i = 0; i < dims.nbDims; i++) {
-            nSize *= dims.d[i];
-        }
-        return nSize;
-    }
-    static nvinfer1::Dims MakeDim(int nBatchSize, nvinfer1::Dims dim) {
-        nvinfer1::Dims ret(dim);
-        for (int i = ret.nbDims; i > 0; i--) {
-            ret.d[i] = ret.d[i - 1];
-        }
-        ret.d[0] = nBatchSize;
-        ret.nbDims++;
-        return ret;
-    }
-
-    ICudaEngine *engine = nullptr;
-    IExecutionContext *context = nullptr;
-    TrtLogger *trt_logger = nullptr;
-};
-
-#define BATCH 1
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
-static DNNReturnType frame_to_dnn(AVFrame *inframe, TRTContext *ctx, int num_bytes)
-{
-    AVHWDeviceContext *hw_device = (AVHWDeviceContext*)ctx->hwdevice->data;
-    AVCUDADeviceContext *hw_ctx = (AVCUDADeviceContext*)hw_device->hwctx;
-    CudaFunctions *cu = hw_ctx->internal->cuda_dl;
-
-    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get((enum AVPixelFormat)inframe->format);
-    int unpack = (desc->flags & AV_PIX_FMT_FLAG_PLANAR) ? 0 : 1;
-    void *frame_to_dnn_args[] = {&ctx->frame_in, inframe->linesize, &ctx->trt_in, &inframe->width,
-                                &inframe->width, &inframe->height, &unpack};
-
-    CUDA_MEMCPY2D copy_param;
-    memset(&copy_param, 0, sizeof(copy_param));
-    copy_param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
-    copy_param.dstDevice = ctx->frame_in;
-    copy_param.dstPitch = inframe->linesize[0];
-    copy_param.srcMemoryType = CU_MEMORYTYPE_HOST;
-    copy_param.srcHost = inframe->data[0];
-    copy_param.srcPitch = inframe->linesize[0];
-    copy_param.WidthInBytes = inframe->linesize[0];
-    copy_param.Height = inframe->height;
-
-    ck_cu(cu->cuMemcpy2DAsync(&copy_param, hw_ctx->stream));
-    ck_cu(cu->cuLaunchKernel(ctx->cu_func_frame_to_dnn, 
-                        DIV_UP(inframe->width, BLOCKX), DIV_UP(inframe->height, BLOCKY), 
-                        1, BLOCKX, BLOCKY, 1, 0, hw_ctx->stream, frame_to_dnn_args, NULL));
-
-    return DNN_SUCCESS;
-}
-
-static DNNReturnType dnn_to_frame(AVFrame *outframe, TRTContext *ctx, int num_bytes)
-{
-    AVHWDeviceContext *hw_device = (AVHWDeviceContext*)ctx->hwdevice->data;
-    AVCUDADeviceContext *hw_ctx = (AVCUDADeviceContext*)hw_device->hwctx;
-    CudaFunctions *cu = hw_ctx->internal->cuda_dl;
-
-    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get((enum AVPixelFormat)outframe->format);
-    int pack = (desc->flags & AV_PIX_FMT_FLAG_PLANAR) ? 0 : 1;
-    void *dnn_to_frame_args[] = {&ctx->trt_out, &outframe->width, &ctx->frame_out, &outframe->linesize[0],
-                                &outframe->width, &outframe->height, &pack};
-
-    CUDA_MEMCPY2D copy_param;
-    memset(&copy_param, 0, sizeof(copy_param));
-    copy_param.dstMemoryType = CU_MEMORYTYPE_HOST;
-    copy_param.dstHost = outframe->data[0];
-    copy_param.dstPitch = outframe->linesize[0];
-    copy_param.srcMemoryType = CU_MEMORYTYPE_DEVICE;
-    copy_param.srcDevice = ctx->frame_out;
-    copy_param.srcPitch = outframe->linesize[0];
-    copy_param.WidthInBytes = outframe->linesize[0];
-    copy_param.Height = outframe->height;
-
-    ck_cu(cu->cuLaunchKernel(ctx->cu_func_dnn_to_frame, 
-                        DIV_UP(outframe->width, BLOCKX), DIV_UP(outframe->height, BLOCKY), 
-                        1, BLOCKX, BLOCKY, 1, 0, hw_ctx->stream, dnn_to_frame_args, NULL));
-    ck_cu(cu->cuMemcpy2DAsync(&copy_param, hw_ctx->stream));
-
-    ck_cu(cu->cuStreamSynchronize(hw_ctx->stream));
-
-    return DNN_SUCCESS;
-}
-
-DNNReturnType trt_load_model(DNNModel *model, const char *model_filename, const AVClass *av_class, const char *options)
-{    
-    int ret = 0;
-    char id_buf[64] = { 0 };
-    AVBufferRef *device_ref = NULL;
-    TRTContext *ctx = (TRTContext*)av_mallocz(sizeof(TRTContext));
-    AVHWDeviceContext *hw_device;
-    AVCUDADeviceContext *hw_ctx;
-    CudaFunctions *cu;
-    CUcontext dummy, cuda_ctx;
-
-    ctx->av_class = av_class;
-    av_opt_set_defaults(ctx);
-    if (av_opt_set_from_string(ctx, options, NULL, "=", "&") < 0)
-    {
-        av_log(ctx, AV_LOG_ERROR, "Failed to parse options \"%s\"\n", options);
-        return DNN_ERROR;
-    }
-    snprintf(id_buf, sizeof(id_buf), "%d", ctx->options.device);
-    
-    // TODO: Add device index option
-    ret = av_hwdevice_ctx_create(&device_ref, AV_HWDEVICE_TYPE_CUDA, id_buf, NULL, 1);
-    if (ret < 0)
-    {
-        av_log(ctx, AV_LOG_ERROR, "Error creating device context\n");
-        return DNN_ERROR;
-    }
-
-    hw_device = (AVHWDeviceContext*)device_ref->data;
-    hw_ctx = (AVCUDADeviceContext*)hw_device->hwctx;
-    cu = hw_ctx->internal->cuda_dl;
-    cuda_ctx = hw_ctx->cuda_ctx;
-
-    ck_cu(cu->cuCtxPushCurrent(cuda_ctx));
-
-    TrtLite *trt_model= new TrtLite{model_filename, ctx};
-    if (trt_model == nullptr)
-    {
-        return DNN_ERROR;
-    }
-
-    ctx->hwdevice = device_ref;
-
-    ck_cu(cu->cuCtxPopCurrent(&dummy));
-
-    trt_model->PrintInfo();
-
-
-    model->model = trt_model;
-    model->get_input = &get_input_trt;
-    model->get_output = &get_output_trt;
-    model->options = options;
-    av_log(ctx, AV_LOG_INFO, "Load trt engine\n");
-
-    return DNN_SUCCESS;
-}
-
-DNNReturnType get_input_trt(void *model, DNNData *input, const char *input_name)
-{
-    TrtLite* trt_model = (TrtLite*)model;
-    TRTContext *ctx = trt_model->ctx;
-    AVHWDeviceContext *hw_device = (AVHWDeviceContext*)ctx->hwdevice->data;
-    AVCUDADeviceContext *hw_ctx = (AVCUDADeviceContext*)hw_device->hwctx;
-    CudaFunctions *cu = hw_ctx->internal->cuda_dl;
-
-    CUcontext dummy, cuda_ctx = hw_ctx->cuda_ctx;
-
-    av_log(ctx, AV_LOG_INFO, "Get TRT input\n");
-
-    // For dynamic shape, input dimensions are set to -1,
-    // trt input is initialized in get_output_trt() along with trt output
-    if (!trt_model->GetEngine()->hasImplicitBatchDimension())
-    {
-        av_log(ctx, AV_LOG_INFO, "Model supports dynamic shape\n");
-        for (int i = 0; i < trt_model->GetEngine()->getNbBindings(); i++) {
-            if (trt_model->GetEngine()->bindingIsInput(i))
-            {
-                ctx->channels = trt_model->GetEngine()->getBindingDimensions(i).d[1];
-                if (ctx->channels == -1)
-                {
-                    av_log(ctx, AV_LOG_ERROR, "Do not support dynamic channel size\n");
-                    return DNN_ERROR;
-                }
-                input->channels = ctx->channels;
-            }
-        }
-        input->height = -1;
-        input->width = -1;
-        input->dt = DNN_FLOAT;
-
-        return DNN_SUCCESS;
-    }
-
-    vector<IOInfo> v_info = trt_model->ConfigIO(BATCH);
-    for (auto info: v_info)
-    {
-        if (info.bInput)
-        {
-            input->channels = info.dim.d[1];
-            input->height = info.dim.d[2];
-            input->width = info.dim.d[3];
-            input->dt = DNN_FLOAT;
-
-            ctx->host_in = new uint8_t[info.GetNumBytes()];
-
-            ck_cu(cu->cuCtxPushCurrent(cuda_ctx));
-
-            ck_cu(cu->cuMemAlloc(&ctx->trt_in, info.GetNumBytes()));
-            ck_cu(cu->cuMemAlloc(&ctx->frame_in, info.GetNumBytes() / sizeof(float)));
-
-            ck_cu(cu->cuCtxPopCurrent(&dummy));
-            
-            return DNN_SUCCESS;
-        }
-    }
-    av_log(ctx, AV_LOG_ERROR, "No input found in the model\n");
-    return DNN_ERROR;
-}
-
-DNNReturnType get_output_trt(void *model, const char *input_name, int input_width, int input_height,
-                                const char *output_name, int *output_width, int *output_height)
-{
-    TrtLite* trt_model = (TrtLite*)model;
-    TRTContext *ctx = trt_model->ctx;
-    AVHWDeviceContext *hw_device = (AVHWDeviceContext*)ctx->hwdevice->data;
-    AVCUDADeviceContext *hw_ctx = (AVCUDADeviceContext*)hw_device->hwctx;
-    CudaFunctions *cu = hw_ctx->internal->cuda_dl;
-
-    CUcontext dummy, cuda_ctx = hw_ctx->cuda_ctx;
-    extern char dnn_io_proc_trt_ptx[];
-
-    av_log(ctx, AV_LOG_INFO, "Get TRT output\n");
-
-    vector<IOInfo> v_info;
-    if (!trt_model->GetEngine()->hasImplicitBatchDimension())
-    {
-        map<int, Dims> i2shape;
-        i2shape.insert(make_pair(0, Dims{4, {BATCH, ctx->channels, input_height, input_width}}));
-        v_info = trt_model->ConfigIO(i2shape);
-    }
-    else
-    {
-        v_info = trt_model->ConfigIO(BATCH);
-    }
-    
-    ck_cu(cu->cuCtxPushCurrent(cuda_ctx));
-
-    for (auto info: v_info)
-    {
-        // For dynamic shape, inputs are initialized here
-        if (info.bInput && (!trt_model->GetEngine()->hasImplicitBatchDimension()))
-        {
-            ctx->host_in = new uint8_t[info.GetNumBytes()];
-            ck_cu(cu->cuMemAlloc(&ctx->trt_in, info.GetNumBytes()));
-            ck_cu(cu->cuMemAlloc(&ctx->frame_in, info.GetNumBytes() / sizeof(float)));
-        }
-        if (!info.bInput)
-        {
-            *output_height = info.dim.d[2];
-            *output_width = info.dim.d[3];
-
-            ctx->host_out = new uint8_t[info.GetNumBytes()];
-            ck_cu(cu->cuMemAlloc(&ctx->trt_out, info.GetNumBytes()));
-            ck_cu(cu->cuMemAlloc(&ctx->frame_out, info.GetNumBytes() / sizeof(float)));
-        }
-    }
-
-    ck_cu(cu->cuModuleLoadData(&ctx->cu_module, dnn_io_proc_trt_ptx));
-    ck_cu(cu->cuModuleGetFunction(&ctx->cu_func_frame_to_dnn, ctx->cu_module, "frame_to_dnn"));
-    ck_cu(cu->cuModuleGetFunction(&ctx->cu_func_dnn_to_frame, ctx->cu_module, "dnn_to_frame"));
-
-    ck_cu(cu->cuCtxPopCurrent(&dummy));
-
-    return DNN_SUCCESS;
-}
-
-DNNReturnType execute_model_trt(const DNNModel *model, const char *input_name, AVFrame *in_frame,
-                                      const char **output_names, uint32_t nb_output, AVFrame *out_frame)
-{
-    TrtLite* trt_model = reinterpret_cast<TrtLite*>(model->model);
-    TRTContext *ctx = trt_model->ctx;
-    AVHWDeviceContext *hw_device = (AVHWDeviceContext*)ctx->hwdevice->data;
-    AVCUDADeviceContext *hw_ctx = (AVCUDADeviceContext*)hw_device->hwctx;
-    CudaFunctions *cu = hw_ctx->internal->cuda_dl;
-
-    CUcontext dummy, cuda_ctx = hw_ctx->cuda_ctx;
-
-    DNNData input, output;
-    vector<void*> buf_vec, device_buf_vec;
-    int ret = 0;
-
-    int input_height = in_frame->height;
-    int input_width = in_frame->width;
-    int input_channels = ctx->channels;
-    vector<IOInfo> IO_info_vec;
-    map<int, Dims> i2shape;
-    if (!trt_model->GetEngine()->hasImplicitBatchDimension())
-    {
-        i2shape.insert(make_pair(0, Dims{4, {BATCH, input_channels, input_height, input_width}}));
-        IO_info_vec = trt_model->ConfigIO(i2shape);
-    }
-    else
-    {
-        IO_info_vec = trt_model->ConfigIO(BATCH);
-    }
-
-    ck_cu(cu->cuCtxPushCurrent(cuda_ctx));
-
-    for (auto info : IO_info_vec)
-    {
-
-        if (info.bInput)
-        {   
-            input.height = info.dim.d[2];
-            input.width = info.dim.d[3];
-            input.channels = info.dim.d[1];
-            input.data = ctx->host_in;
-            input.dt = DNN_FLOAT;
-            ret = frame_to_dnn(in_frame, ctx, info.GetNumBytes() / sizeof(float));
-            
-            if (ret < 0)
-                return DNN_ERROR;
-    
-            device_buf_vec.push_back((void*)ctx->trt_in);
-            continue;
-        }
-        else
-        {
-            device_buf_vec.push_back((void*)ctx->trt_out);
-        }
-    }
-
-    if (!trt_model->GetEngine()->hasImplicitBatchDimension())
-    {
-        trt_model->Execute(i2shape, device_buf_vec, hw_ctx->stream);
-    }
-    else
-    {
-        trt_model->Execute(BATCH, device_buf_vec, hw_ctx->stream);
-    }
-
-    for (uint32_t i = 0; i < IO_info_vec.size(); i++)
-    {
-        if (!IO_info_vec[i].bInput)
-        {
-            output.height = IO_info_vec[i].dim.d[2];
-            output.width = IO_info_vec[i].dim.d[3];
-            output.channels = IO_info_vec[i].dim.d[1];
-            output.data = ctx->host_out;
-            output.dt = DNN_FLOAT;
-            ret = dnn_to_frame(out_frame, ctx, IO_info_vec[i].GetNumBytes() / sizeof(float));
-        }
-    }
-
-    ck_cu(cu->cuCtxPopCurrent(&dummy));
-
-    return DNN_SUCCESS;
-}
-
-DNNReturnType free_model_trt(DNNModel *model)
-{
-    TrtLite* trt_model = reinterpret_cast<TrtLite*>(model->model);
-    TRTContext *ctx = trt_model->ctx;
-    AVHWDeviceContext *hw_device = (AVHWDeviceContext*)ctx->hwdevice->data;
-    AVCUDADeviceContext *hw_ctx = (AVCUDADeviceContext*)hw_device->hwctx;
-    CudaFunctions *cu = hw_ctx->internal->cuda_dl;
-
-    delete[]((uint8_t*)ctx->host_in);
-    delete[]((uint8_t*)ctx->host_out);
-    ck_cu(cu->cuMemFree(ctx->trt_in));
-    ck_cu(cu->cuMemFree(ctx->trt_out));
-    
-    delete(trt_model);
-    
-    av_buffer_unref(&ctx->hwdevice);
-    av_free(ctx);
-    model->model = NULL;
-    
-    return DNN_SUCCESS;
-}
-#ifdef __cplusplus
-}
-#endif
diff --git a/libavfilter/dnn/trt_class_wrapper.h b/libavfilter/dnn/trt_class_wrapper.h
deleted file mode 100644
index 18815fadae..0000000000
--- a/libavfilter/dnn/trt_class_wrapper.h
+++ /dev/null
@@ -1,49 +0,0 @@ 
-/*
-* Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-* DEALINGS IN THE SOFTWARE.
- */
- 
-/**
- * @file
- * TensorRT wrapper header for dnn_backend in ffmpeg.
- */
-
-#ifndef TRT_CLASS_WRAPPER_H
-#define TRT_CLASS_WRAPPER_H
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
-    #include "../dnn_interface.h"
-
-    DNNReturnType free_model_trt(DNNModel *model);
-    DNNReturnType execute_model_trt(const DNNModel *model, const char *input_name, AVFrame *in_frame,
-                                           const char **output_names, uint32_t nb_output, AVFrame *out_frame);
-    DNNReturnType get_output_trt(void *model, const char *input_name, int input_width, int input_height,
-                                        const char *output_name, int *output_width, int *output_height);
-    DNNReturnType get_input_trt(void *model, DNNData *input, const char *input_name);
-    DNNReturnType trt_load_model(DNNModel *model, const char *model_filename, const AVClass *av_class, const char *options);
-
-#ifdef __cplusplus
-}
-#endif
-#endif