[FFmpeg-devel,06/16] avutil: add nvtegra hwcontext

Message ID	bb17171cf16ebcb4d15d9b998bc955754b133bdf.1717083800.git.averne381@gmail.com
State	New
Headers	show Delivered-To: ffmpegpatchwork2@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; From: averne <averne381@gmail.com> To: ffmpeg-devel@ffmpeg.org Date: Thu, 30 May 2024 21:43:08 +0200 Message-ID: <bb17171cf16ebcb4d15d9b998bc955754b133bdf.1717083800.git.averne381@gmail.com> In-Reply-To: <cover.1717083799.git.averne381@gmail.com> References: <cover.1717083799.git.averne381@gmail.com> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 06/16] avutil: add nvtegra hwcontext Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Cc: averne <averne381@gmail.com> Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>
Series	NVidia Tegra hardware decoding backend \| expand [FFmpeg-devel,00/16] NVidia Tegra hardware decoding backend [FFmpeg-devel,01/16] avutil/buffer: add helper to allocate aligned memory [FFmpeg-devel,02/16] configure, avutil: add support for HorizonOS [FFmpeg-devel,03/16] avutil: add ioctl definitions for tegra devices [FFmpeg-devel,04/16] avutil: add hardware definitions for NVDEC, NVJPG and VIC [FFmpeg-devel,05/16] avutil: add common code for nvtegra [FFmpeg-devel,06/16] avutil: add nvtegra hwcontext [FFmpeg-devel,07/16] hwcontext_nvtegra: add dynamic frequency scaling routines [FFmpeg-devel,08/16] nvtegra: add common hardware decoding code [FFmpeg-devel,09/16] nvtegra: add mpeg1/2 hardware decoding [FFmpeg-devel,10/16] nvtegra: add mpeg4 hardware decoding [FFmpeg-devel,11/16] nvtegra: add vc1 hardware decoding [FFmpeg-devel,12/16] nvtegra: add h264 hardware decoding [FFmpeg-devel,13/16] nvtegra: add hevc hardware decoding [FFmpeg-devel,14/16] nvtegra: add vp8 hardware decoding [FFmpeg-devel,15/16] nvtegra: add vp9 hardware decoding [FFmpeg-devel,16/16] nvtegra: add mjpeg hardware decoding

Context	Check	Description
yinshiyou/commit_msg_loongarch64	warning	Please wrap lines in the body of the commit message between 60 and 72 characters.
andriy/commit_msg_x86	warning	Please wrap lines in the body of the commit message between 60 and 72 characters.
andriy/make_fate_x86	success	Make fate finished
andriy/make_x86	warning	New warnings during build

diff --git a/libavutil/Makefile b/libavutil/Makefile index 733a23a8a3..44cd3f0dda 100644 --- a/libavutil/Makefile +++ b/libavutil/Makefile @@ -52,6 +52,7 @@ HEADERS = adler32.h \ hwcontext_videotoolbox.h \ hwcontext_vdpau.h \ hwcontext_vulkan.h \ + hwcontext_nvtegra.h \ nvtegra.h \ nvhost_ioctl.h \ nvmap_ioctl.h \ @@ -210,7 +211,7 @@ OBJS-$(CONFIG_VDPAU) += hwcontext_vdpau.o OBJS-$(CONFIG_VULKAN) += hwcontext_vulkan.o vulkan.o OBJS-$(!CONFIG_VULKAN) += hwcontext_stub.o -OBJS-$(CONFIG_NVTEGRA) += nvtegra.o +OBJS-$(CONFIG_NVTEGRA) += nvtegra.o hwcontext_nvtegra.o OBJS += $(COMPAT_OBJS:%=../compat/%) @@ -233,7 +234,9 @@ SKIPHEADERS-$(CONFIG_VULKAN) += hwcontext_vulkan.h vulkan.h \ vulkan_functions.h \ vulkan_loader.h SKIPHEADERS-$(CONFIG_NVTEGRA) += nvtegra.h \ - nvtegra_host1x.h + nvtegra_host1x.h \ + hwcontext_nvtegra.h + TESTPROGS = adler32 \ aes \ diff --git a/libavutil/hwcontext.c b/libavutil/hwcontext.c index fa99a0d8a4..8dd05147a4 100644 --- a/libavutil/hwcontext.c +++ b/libavutil/hwcontext.c @@ -65,6 +65,9 @@ static const HWContextType * const hw_table[] = { #endif #if CONFIG_VULKAN &ff_hwcontext_type_vulkan, +#endif +#if CONFIG_NVTEGRA + &ff_hwcontext_type_nvtegra, #endif NULL, }; @@ -82,6 +85,7 @@ static const char *const hw_type_names[] = { [AV_HWDEVICE_TYPE_VIDEOTOOLBOX] = "videotoolbox", [AV_HWDEVICE_TYPE_MEDIACODEC] = "mediacodec", [AV_HWDEVICE_TYPE_VULKAN] = "vulkan", + [AV_HWDEVICE_TYPE_NVTEGRA] = "nvtegra", }; typedef struct FFHWDeviceContext { diff --git a/libavutil/hwcontext.h b/libavutil/hwcontext.h index bac30debae..d506281784 100644 --- a/libavutil/hwcontext.h +++ b/libavutil/hwcontext.h @@ -38,6 +38,7 @@ enum AVHWDeviceType { AV_HWDEVICE_TYPE_MEDIACODEC, AV_HWDEVICE_TYPE_VULKAN, AV_HWDEVICE_TYPE_D3D12VA, + AV_HWDEVICE_TYPE_NVTEGRA, }; /** diff --git a/libavutil/hwcontext_internal.h b/libavutil/hwcontext_internal.h index e32b786238..478583abdd 100644 --- a/libavutil/hwcontext_internal.h +++ b/libavutil/hwcontext_internal.h @@ -163,5 +163,6 @@ 
extern const HWContextType ff_hwcontext_type_vdpau; extern const HWContextType ff_hwcontext_type_videotoolbox; extern const HWContextType ff_hwcontext_type_mediacodec; extern const HWContextType ff_hwcontext_type_vulkan; +extern const HWContextType ff_hwcontext_type_nvtegra; #endif /* AVUTIL_HWCONTEXT_INTERNAL_H */ diff --git a/libavutil/hwcontext_nvtegra.c b/libavutil/hwcontext_nvtegra.c new file mode 100644 index 0000000000..0f4d5a323b --- /dev/null +++ b/libavutil/hwcontext_nvtegra.c @@ -0,0 +1,880 @@ +/* + * Copyright (c) 2024 averne <averne381@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include <stdbool.h> + +#include "config.h" +#include "pixdesc.h" +#include "imgutils.h" +#include "internal.h" +#include "mem.h" +#include "time.h" + +#include "hwcontext.h" +#include "hwcontext_internal.h" + +#include "nvhost_ioctl.h" +#include "nvmap_ioctl.h" +#include "nvtegra_host1x.h" +#include "clb0b6.h" +#include "vic_drv.h" + +#include "hwcontext_nvtegra.h" + +typedef struct NVTegraDevicePriv { + /* The public AVNVTegraDeviceContext */ + AVNVTegraDeviceContext p; + + AVBufferRef *driver_state_ref; + + AVNVTegraJobPool job_pool; + uint32_t vic_setup_off, vic_cmdbuf_off; +} NVTegraDevicePriv; + +static const enum AVPixelFormat supported_sw_formats[] = { + AV_PIX_FMT_GRAY8, + AV_PIX_FMT_NV12, + AV_PIX_FMT_P010, + AV_PIX_FMT_YUV420P, +}; + +int av_nvtegra_pixfmt_to_vic(enum AVPixelFormat fmt) { + switch (fmt) { + case AV_PIX_FMT_GRAY8: + return NVB0B6_T_L8; + case AV_PIX_FMT_NV12: + return NVB0B6_T_Y8___U8V8_N420; + case AV_PIX_FMT_YUV420P: + return NVB0B6_T_Y8___U8___V8_N420; + case AV_PIX_FMT_RGB565: + return NVB0B6_T_R5G6B5; + case AV_PIX_FMT_RGB32: + return NVB0B6_T_A8R8G8B8; + case AV_PIX_FMT_BGR32: + return NVB0B6_T_A8B8G8R8; + case AV_PIX_FMT_RGB32_1: + return NVB0B6_T_R8G8B8A8; + case AV_PIX_FMT_BGR32_1: + return NVB0B6_T_B8G8R8A8; + case AV_PIX_FMT_0RGB32: + return NVB0B6_T_X8R8G8B8; + case AV_PIX_FMT_0BGR32: + return NVB0B6_T_X8B8G8R8; + default: + return -1; + } +} + +static inline uint32_t nvtegra_surface_get_width_align(enum AVPixelFormat fmt, const AVComponentDescriptor *comp) { + int step = comp->step; + + if (fmt != AV_PIX_FMT_NVTEGRA) + return 256 / step; /* Pitch linear surfaces must be aligned to 256B for VIC */ + + /* + * GOBs are 64B wide. + * In addition, we use a 32Bx8 cache width in VIC for block linear surfaces. 
+ */ + return 64 / step; +} + +static inline uint32_t nvtegra_surface_get_height_align(enum AVPixelFormat fmt, const AVComponentDescriptor *comp) { + /* Height alignment is in terms of lines, not bytes, therefore we don't divide by the sample step */ + if (fmt != AV_PIX_FMT_NVTEGRA) + return 4; /* We use 64Bx4 cache width in VIC for pitch linear surfaces */ + + /* + * GOBs are 8B high, and we use a GOB height of 2. + * In addition, we use a 32Bx8 cache width in VIC for block linear surfaces. + * We double this requirement to make sure it is respected for the subsampled chroma plane. + */ + return 32; +} + +static void nvtegra_device_uninit(AVHWDeviceContext *ctx) { + NVTegraDevicePriv *priv = ctx->hwctx; + AVNVTegraDeviceContext *hwctx = &priv->p; + + av_log(ctx, AV_LOG_DEBUG, "Deinitializing NVTEGRA device\n"); + + av_nvtegra_job_pool_uninit(&priv->job_pool); + + if (hwctx->nvdec_version) { + av_nvtegra_channel_close(&hwctx->nvdec_channel); +#ifdef __SWITCH__ + mmuRequestFinalize(&hwctx->nvdec_channel.mmu_request); +#endif + } + + if (hwctx->nvjpg_version) { + av_nvtegra_channel_close(&hwctx->nvjpg_channel); +#ifdef __SWITCH__ + mmuRequestFinalize(&hwctx->nvjpg_channel.mmu_request); +#endif + } + + av_nvtegra_channel_close(&hwctx->vic_channel); + + av_buffer_unref(&priv->driver_state_ref); +} + +/* + * Hardware modules on the Tegra X1 (see t210.c in l4t kernel sources) + * - nvdec v2.0 + * - nvenc v5.0 + * - nvjpg v1.0 + * - vic v4.0 + */ + +static int nvtegra_device_init(AVHWDeviceContext *ctx) { + NVTegraDevicePriv *priv = ctx->hwctx; + AVNVTegraDeviceContext *hwctx = &priv->p; + + uint32_t vic_map_size; + int err; + + av_log(ctx, AV_LOG_DEBUG, "Initializing NVTEGRA device\n"); + + err = av_nvtegra_channel_open(&hwctx->nvdec_channel, "/dev/nvhost-nvdec"); + if (!err) + hwctx->nvdec_version = AV_NVTEGRA_ENCODE_REV(2,0); + + err = av_nvtegra_channel_open(&hwctx->nvjpg_channel, "/dev/nvhost-nvjpg"); + if (!err) + hwctx->nvjpg_version = AV_NVTEGRA_ENCODE_REV(1,0); + 
+ err = av_nvtegra_channel_open(&hwctx->vic_channel, "/dev/nvhost-vic"); + if (err < 0) + goto fail; + + hwctx->vic_version = AV_NVTEGRA_ENCODE_REV(4,0); + + /* Note: Official code only sets this for the nvdec channel */ + if (hwctx->nvdec_version) { + err = av_nvtegra_channel_set_submit_timeout(&hwctx->nvdec_channel, 1000); + if (err < 0) + goto fail; + } + + if (hwctx->nvjpg_version) { + err = av_nvtegra_channel_set_submit_timeout(&hwctx->nvjpg_channel, 1000); + if (err < 0) + goto fail; + } + + priv->vic_setup_off = 0; + priv->vic_cmdbuf_off = FFALIGN(priv->vic_setup_off + sizeof(VicConfigStruct), + AV_NVTEGRA_MAP_ALIGN); + vic_map_size = FFALIGN(priv->vic_cmdbuf_off + AV_NVTEGRA_MAP_ALIGN, + 0x1000); + + err = av_nvtegra_job_pool_init(&priv->job_pool, &hwctx->vic_channel, vic_map_size, + priv->vic_cmdbuf_off, vic_map_size - priv->vic_cmdbuf_off); + if (err < 0) + goto fail; + +#ifndef __SWITCH__ + hwctx->nvdec_channel.module_id = 0x75; + hwctx->nvjpg_channel.module_id = 0x76; +#else + /* + * The NVHOST_IOCTL_CHANNEL_SET_CLK_RATE ioctl also exists on HOS but the clock rate + * will be reset when the console goes to sleep. 
+ */ + if (hwctx->nvdec_version) { + err = AVERROR(mmuRequestInitialize(&hwctx->nvdec_channel.mmu_request, (MmuModuleId)5, 8, false)); + if (err < 0) + goto fail; + } + + if (hwctx->nvjpg_version) { + err = AVERROR(mmuRequestInitialize(&hwctx->nvjpg_channel.mmu_request, MmuModuleId_Nvjpg, 8, false)); + if (err < 0) + goto fail; + } +#endif + + return 0; + +fail: + nvtegra_device_uninit(ctx); + return err; +} + +static int nvtegra_device_create(AVHWDeviceContext *ctx, const char *device, + AVDictionary *opts, int flags) +{ + NVTegraDevicePriv *priv = ctx->hwctx; + + av_log(ctx, AV_LOG_DEBUG, "Creating NVTEGRA device\n"); + + priv->driver_state_ref = av_nvtegra_driver_init(); + if (!priv->driver_state_ref) { + av_log(ctx, AV_LOG_ERROR, "Failed to create driver context, " + "make sure you are using a Tegra device\n"); + return AVERROR(ENOSYS); + } + + return 0; +} + +static int nvtegra_frames_get_constraints(AVHWDeviceContext *ctx, const void *hwconfig, + AVHWFramesConstraints *constraints) +{ + av_log(ctx, AV_LOG_DEBUG, "Getting frame constraints for NVTEGRA device\n"); + + constraints->valid_sw_formats = av_malloc_array(FF_ARRAY_ELEMS(supported_sw_formats) + 1, + sizeof(*constraints->valid_sw_formats)); + if (!constraints->valid_sw_formats) + return AVERROR(ENOMEM); + + for (int i = 0; i < FF_ARRAY_ELEMS(supported_sw_formats); ++i) + constraints->valid_sw_formats[i] = supported_sw_formats[i]; + constraints->valid_sw_formats[FF_ARRAY_ELEMS(supported_sw_formats)] = AV_PIX_FMT_NONE; + + constraints->valid_hw_formats = av_malloc_array(2, sizeof(*constraints->valid_hw_formats)); + if (!constraints->valid_hw_formats) + return AVERROR(ENOMEM); + + constraints->valid_hw_formats[0] = AV_PIX_FMT_NVTEGRA; + constraints->valid_hw_formats[1] = AV_PIX_FMT_NONE; + + return 0; +} + +static void nvtegra_map_free(void *opaque, uint8_t *data) { + AVNVTegraMap *map = (AVNVTegraMap *)data; + + if (!map) + return; + + av_nvtegra_map_destroy(map); + + av_freep(&map); +} + +static void 
nvtegra_frame_free(void *opaque, uint8_t *data) { + AVNVTegraFrame *frame = (AVNVTegraFrame *)data; + + if (!frame) + return; + + av_buffer_unref(&frame->map_ref); + + av_freep(&frame); +} + +/* + * Buffer pool allocator callback: creates an AVNVTegraMap of the requested size and + * wraps it in an AVNVTegraFrame. Returns NULL (after logging) if any step fails. + */ +static AVBufferRef *nvtegra_pool_alloc(void *opaque, size_t size) { + AVHWFramesContext *ctx = opaque; + AVNVTegraDeviceContext *hwctx = &((NVTegraDevicePriv *)ctx->device_ctx->hwctx)->p; + + AVBufferRef *buffer = NULL; + AVNVTegraFrame *frame = NULL; + AVNVTegraMap *map = NULL; + bool map_created = false; + int err; + + av_log(ctx, AV_LOG_DEBUG, "Creating surface from NVTEGRA device\n"); + + map = av_mallocz(sizeof(*map)); + if (!map) + goto fail; + + frame = av_mallocz(sizeof(*frame)); + if (!frame) + goto fail; + + /* + * Framebuffers are allocated as CPU-cacheable, since they might get copied from + * during transfer operations. Cache management is done manually. + */ + err = av_nvtegra_map_create(map, &hwctx->nvdec_channel, size, 0x100, + NVMAP_HEAP_CARVEOUT_GENERIC, NVMAP_HANDLE_CACHEABLE); + if (err < 0) + goto fail; + map_created = true; + + /* Flush the CPU cache */ + av_nvtegra_map_cache_op(map, NVMAP_CACHE_OP_WB, av_nvtegra_map_get_addr(map), + av_nvtegra_map_get_size(map)); + + frame->map_ref = av_buffer_create((uint8_t *)map, sizeof(*map), nvtegra_map_free, ctx, 0); + if (!frame->map_ref) + goto fail; + + buffer = av_buffer_create((uint8_t *)frame, sizeof(*frame), nvtegra_frame_free, ctx, 0); + if (!buffer) + goto fail; + + return buffer; + +fail: + av_log(ctx, AV_LOG_ERROR, "Failed to create buffer\n"); + /* + * The map is only owned by frame->map_ref once that reference exists; + * before that point it has to be released here or it leaks. + */ + if (map && (!frame || !frame->map_ref)) { + if (map_created) + av_nvtegra_map_destroy(map); + av_freep(&map); + } + nvtegra_frame_free(opaque, (uint8_t *)frame); + return NULL; +} + +static int nvtegra_frames_init(AVHWFramesContext *ctx) { + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(ctx->sw_format); + + uint32_t width_aligned, height_aligned; + int size; + + av_log(ctx, AV_LOG_DEBUG, "Initializing frame pool for the NVTEGRA device\n"); + + /* An unknown software format has no descriptor to derive alignment from */ + if (!desc) + return AVERROR(EINVAL); + + if (!ctx->pool) { + width_aligned = FFALIGN(ctx->width, nvtegra_surface_get_width_align (ctx->format, &desc->comp[0])); + height_aligned = FFALIGN(ctx->height, 
nvtegra_surface_get_height_align(ctx->format, &desc->comp[0])); + + size = av_image_get_buffer_size(ctx->sw_format, width_aligned, height_aligned, + nvtegra_surface_get_width_align(ctx->format, &desc->comp[0])); + /* A negative value is an error code, not a size: do not hand it to the pool */ + if (size < 0) + return size; + + ffhwframesctx(ctx)->pool_internal = av_buffer_pool_init2(size, ctx, nvtegra_pool_alloc, NULL); + if (!ffhwframesctx(ctx)->pool_internal) + return AVERROR(ENOMEM); + } + + return 0; +} + +static void nvtegra_frames_uninit(AVHWFramesContext *ctx) { + av_log(ctx, AV_LOG_DEBUG, "Deinitializing frame pool for the NVTEGRA device\n"); +} + +static int nvtegra_get_buffer(AVHWFramesContext *ctx, AVFrame *frame) { + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(ctx->sw_format); + + AVNVTegraMap *map; + uint32_t width_aligned, height_aligned; + int err; + + av_log(ctx, AV_LOG_DEBUG, "Getting frame buffer for NVTEGRA device\n"); + + frame->buf[0] = av_buffer_pool_get(ctx->pool); + if (!frame->buf[0]) + return AVERROR(ENOMEM); + + map = av_nvtegra_frame_get_fbuf_map(frame); + + width_aligned = FFALIGN(ctx->width, nvtegra_surface_get_width_align (ctx->format, &desc->comp[0])); + height_aligned = FFALIGN(ctx->height, nvtegra_surface_get_height_align(ctx->format, &desc->comp[0])); + + err = av_image_fill_arrays(frame->data, frame->linesize, av_nvtegra_map_get_addr(map), + ctx->sw_format, width_aligned, height_aligned, + nvtegra_surface_get_width_align(ctx->format, &desc->comp[0])); + if (err < 0) + return err; + + frame->format = AV_PIX_FMT_NVTEGRA; + frame->width = ctx->width; + frame->height = ctx->height; + + return 0; +} + +static int nvtegra_transfer_get_formats(AVHWFramesContext *ctx, + enum AVHWFrameTransferDirection dir, + enum AVPixelFormat **formats) +{ + enum AVPixelFormat *fmts; + + av_log(ctx, AV_LOG_DEBUG, "Getting transfer formats for NVTEGRA device\n"); + + fmts = av_malloc_array(2, sizeof(**formats)); + if (!fmts) + return AVERROR(ENOMEM); + + fmts[0] = ctx->sw_format; + fmts[1] = AV_PIX_FMT_NONE; + + *formats = fmts; + return 0; +} + +static 
inline void nvtegra_cpu_copy_plane(void *dst, int dst_stride, + void *src, int src_stride, int h, bool from) +{ + /* + * Adapted from https://fgiesen.wordpress.com/2011/01/17/texture-tiling-and-swizzling/. + * We process 16x2 bytes at a time. Horizontally, this is the size of a linear atom + * in a 16Bx2 sector, conveniently also the size of a cache line and of a macroblock. + * + * NVDEC always uses a GOB height of 2 (block height of 16, in line with macroblock dimensions). + * The corresponding swizzling pattern is the following: + * y3 y2 y1 y0 x5 x4 x3 x2 x1 x0 + * x: ___x5_______x4____x3 x2 x1 x0 + * y: y3____y2 y1____y0____________ + * + * Addresses for the 4 lower bits can then be copied as-is (16 bytes). + * As a further optimization, the y0 bit is also handled within the same inner loop, + * which halves the total number of iterations. + * + * This function is declared inline with the expectation that the compiler will optimize + * the branches depending on the copy direction. + */ + + __uint128_t *src_ = src, *dst_ = dst, *src_line, *dst_line; + uint32_t ws = src_stride / sizeof(__uint128_t), wd = dst_stride / sizeof(__uint128_t), + w = FFMIN(ws, wd), offs_x = 0, offs_y = 0, offs_line; + uint32_t x_mask = -0x2e, y_mask = 0x2c; + int x, y; + + for (y = 0; y < h; y += 2) { + dst_line = dst_ + (from ? y * wd : offs_y); + src_line = src_ + (from ? offs_y : y * ws); + + offs_line = offs_x; + for (x = 0; x < w; ++x) { + dst_line[from ? x+0 : offs_line+0] = src_line[from ? offs_line+0 : x+0 ]; + dst_line[from ? x+wd : offs_line+1] = src_line[from ? offs_line+1 : x+ws]; + offs_line = (offs_line - x_mask) & x_mask; + } + + offs_y = (offs_y - y_mask) & y_mask; + + /* Wrap into next tile row */ + if (!offs_y) + offs_x += from ? 
src_stride : dst_stride; + } +} + +static int nvtegra_cpu_transfer_data(AVHWFramesContext *ctx, const AVFrame *dst, const AVFrame *src, + int num_planes, bool from) +{ + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(ctx->sw_format); + const AVFrame *hwframe, *swframe; + AVNVTegraMap *map; + int h, i; + + hwframe = from ? src : dst, swframe = from ? dst : src; + map = av_nvtegra_frame_get_fbuf_map(hwframe); + + if (swframe->format != ctx->sw_format) { + av_log(ctx, AV_LOG_ERROR, "Source and destination must have the same format for cpu transfers\n"); + return AVERROR(EINVAL); + } + + /* If we are transferring from a hardware frame, invalidate the CPU cache which might be stale */ + if (from) { + av_nvtegra_map_cache_op(map, NVMAP_CACHE_OP_INV, + av_nvtegra_map_get_addr(map), av_nvtegra_map_get_size(map)); + } + + /* Align the height to an even size */ + h = FFALIGN(dst->height, 2); + + for (i = 0; i < num_planes; ++i) { + if (map->is_linear) { + av_image_copy_plane(dst->data[i], dst->linesize[i], src->data[i], src->linesize[i], + FFMIN(dst->linesize[i], src->linesize[i]), + h >> (i ? desc->log2_chroma_h : 0)); + } else { + /* + * Instantiate the same inlined function for both directions, + * giving the compiler the opportunity to remove branching within the copy loops. + * (verified by decompilation at -O1 and higher for both gcc and clang) + */ + if (from) + nvtegra_cpu_copy_plane(dst->data[i], dst->linesize[i], src->data[i], src->linesize[i], + h >> (i ? desc->log2_chroma_h : 0), true); + else + nvtegra_cpu_copy_plane(dst->data[i], dst->linesize[i], src->data[i], src->linesize[i], + h >> (i ? 
desc->log2_chroma_h : 0), false); + } + } + + /* If we transferred to a hardware frame, flush the CPU cache to make the data visible to hardware engines */ + if (!from) { + av_nvtegra_map_cache_op(map, NVMAP_CACHE_OP_WB, + av_nvtegra_map_get_addr(map), av_nvtegra_map_get_size(map)); + } + + return 0; +} + +static void nvtegra_vic_preprare_config(VicConfigStruct *config, const AVFrame *src, const AVFrame *dst, + enum AVPixelFormat fmt, bool is_16b_chroma) +{ + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt); + bool input_linear = (src->format != AV_PIX_FMT_NVTEGRA) || av_nvtegra_frame_get_fbuf_map(src)->is_linear, + output_linear = (dst->format != AV_PIX_FMT_NVTEGRA) || av_nvtegra_frame_get_fbuf_map(dst)->is_linear; + + /* + * The VIC engine has an undocumented limitation regarding height alignment, + * which should be padded to an even size. + */ + + /* Subsampled dimensions when emulating 16-bit chroma transfers, as input is always NV12 */ + int divider = !is_16b_chroma ? 1 : 2; + int src_width = src->width / divider, src_height = FFALIGN(src->height, 2) / divider, + dst_width = dst->width / divider, dst_height = FFALIGN(dst->height, 2) / divider; + + *config = (VicConfigStruct){ + .pipeConfig = { + .DownsampleHoriz = 1 << 2, /* U9.2 */ + .DownsampleVert = 1 << 2, /* U9.2 */ + }, + .outputConfig = { + .AlphaFillMode = !is_16b_chroma ? NVB0B6_DXVAHD_ALPHA_FILL_MODE_OPAQUE : + NVB0B6_DXVAHD_ALPHA_FILL_MODE_SOURCE_STREAM, + .BackgroundAlpha = 0, + .BackgroundR = 0, + .BackgroundG = 0, + .BackgroundB = 0, + .TargetRectLeft = 0, + .TargetRectRight = dst_width - 1, + .TargetRectTop = 0, + .TargetRectBottom = dst_height - 1, + }, + .outputSurfaceConfig = { + .OutPixelFormat = av_nvtegra_pixfmt_to_vic(fmt), + .OutSurfaceWidth = dst_width - 1, + .OutSurfaceHeight = dst_height - 1, + .OutBlkKind = !output_linear ? NVB0B6_BLK_KIND_GENERIC_16Bx2 : NVB0B6_BLK_KIND_PITCH, + .OutBlkHeight = !output_linear ? 
1 : 0, /* GOB height 2 */ + .OutLumaWidth = (dst->linesize[0] / desc->comp[0].step) - 1, + .OutLumaHeight = FFALIGN(dst_height, !output_linear ? 32 : 2) - 1, + .OutChromaWidth = (desc->flags & AV_PIX_FMT_FLAG_RGB) ? + -1 : (dst->linesize[1] / desc->comp[1].step) - 1, + .OutChromaHeight = (desc->flags & AV_PIX_FMT_FLAG_RGB) ? -1 : + (FFALIGN(dst_height, !output_linear ? 32 : 2) >> desc->log2_chroma_h) - 1, + }, + .slotStruct = { + { + .slotConfig = { + .SlotEnable = 1, + .CurrentFieldEnable = 1, + .SoftClampLow = 0, + .SoftClampHigh = 1023, + .PlanarAlpha = 1023, + .ConstantAlpha = 1, + .SourceRectLeft = 0, + .SourceRectRight = (src_width - 1) << 16, /* U14.16 (for subpixel positioning) */ + .SourceRectTop = 0, + .SourceRectBottom = (src_height - 1) << 16, + .DestRectLeft = 0, + .DestRectRight = src_width - 1, + .DestRectTop = 0, + .DestRectBottom = src_height - 1, + }, + .slotSurfaceConfig = { + .SlotPixelFormat = av_nvtegra_pixfmt_to_vic(fmt), + .SlotChromaLocHoriz = ((desc->flags & AV_PIX_FMT_FLAG_RGB) || + src->chroma_location == AVCHROMA_LOC_TOPLEFT || + src->chroma_location == AVCHROMA_LOC_LEFT || + src->chroma_location == AVCHROMA_LOC_BOTTOMLEFT) ? 0 : 1, + .SlotChromaLocVert = ((desc->flags & AV_PIX_FMT_FLAG_RGB) || + src->chroma_location == AVCHROMA_LOC_TOPLEFT || + src->chroma_location == AVCHROMA_LOC_TOP) ? 0 : + (src->chroma_location == AVCHROMA_LOC_LEFT || + src->chroma_location == AVCHROMA_LOC_CENTER) ? 1 : 2, + .SlotBlkKind = !input_linear ? NVB0B6_BLK_KIND_GENERIC_16Bx2 : NVB0B6_BLK_KIND_PITCH, + .SlotBlkHeight = !input_linear ? 1 : 0, /* GOB height 2 */ + .SlotCacheWidth = !input_linear ? 1 : 3, /* 32Bx8 for block, 128Bx2 for pitch */ + .SlotSurfaceWidth = src_width - 1, + .SlotSurfaceHeight = src_height - 1, + .SlotLumaWidth = (src->linesize[0] / desc->comp[0].step) - 1, + .SlotLumaHeight = FFALIGN(src_height, !input_linear ? 32 : 2) - 1, + .SlotChromaWidth = (desc->flags & AV_PIX_FMT_FLAG_RGB) ? 
+ -1 : (src->linesize[1] / desc->comp[1].step) - 1, + .SlotChromaHeight = (desc->flags & AV_PIX_FMT_FLAG_RGB) ? -1 : + (FFALIGN(src_height, !input_linear ? 32 : 2) >> desc->log2_chroma_h) - 1, + }, + }, + }, + }; +} + +static int nvtegra_vic_prepare_cmdbuf(AVHWFramesContext *ctx, AVNVTegraJobPool *pool, AVNVTegraJob *job, + const AVFrame *src, const AVFrame *dst, enum AVPixelFormat fmt, + AVNVTegraMap **plane_maps, uint32_t *plane_offsets, int num_planes) +{ + NVTegraDevicePriv *priv = ctx->device_ctx->hwctx; + AVNVTegraCmdbuf *cmdbuf = &job->cmdbuf; + + AVNVTegraMap *src_maps[4], *dst_maps[4]; + uint32_t src_map_offsets[4], dst_map_offsets[4]; + int src_reloc_type, dst_reloc_type, i, err; + +#define RELOC_VARS(frame) ({ \ + if (frame->format == AV_PIX_FMT_NVTEGRA) { \ + for (i = 0; i < FF_ARRAY_ELEMS(AV_JOIN(frame, _map_offsets)); ++i) { \ + AV_JOIN(frame, _maps )[i] = av_nvtegra_frame_get_fbuf_map(frame); \ + AV_JOIN(frame, _map_offsets)[i] = frame->data[i] - frame->data[0]; \ + } \ + AV_JOIN(frame, _reloc_type) = !av_nvtegra_frame_get_fbuf_map(frame)->is_linear ? 
\ + NVHOST_RELOC_TYPE_BLOCK_LINEAR : NVHOST_RELOC_TYPE_PITCH_LINEAR; \ + } else { \ + for (i = 0; i < FF_ARRAY_ELEMS(AV_JOIN(frame, _map_offsets)); ++i) { \ + AV_JOIN(frame, _maps )[i] = plane_maps [i]; \ + AV_JOIN(frame, _map_offsets)[i] = plane_offsets[i]; \ + } \ + AV_JOIN(frame, _reloc_type) = NVHOST_RELOC_TYPE_PITCH_LINEAR; \ + } \ +}) + + RELOC_VARS(src); + RELOC_VARS(dst); + + err = av_nvtegra_cmdbuf_begin(cmdbuf, HOST1X_CLASS_VIC); + if (err < 0) + return err; + + AV_NVTEGRA_PUSH_VALUE(cmdbuf, NVB0B6_VIDEO_COMPOSITOR_SET_CONTROL_PARAMS, + AV_NVTEGRA_VALUE(NVB0B6_VIDEO_COMPOSITOR_SET_CONTROL_PARAMS, CONFIG_STRUCT_SIZE, sizeof(VicConfigStruct) >> 4) | + AV_NVTEGRA_VALUE(NVB0B6_VIDEO_COMPOSITOR_SET_CONTROL_PARAMS, GPTIMER_ON, 1) | + AV_NVTEGRA_VALUE(NVB0B6_VIDEO_COMPOSITOR_SET_CONTROL_PARAMS, FALCON_CONTROL, 1)); + AV_NVTEGRA_PUSH_RELOC(cmdbuf, NVB0B6_VIDEO_COMPOSITOR_SET_CONFIG_STRUCT_OFFSET, + &job->input_map, priv->vic_setup_off, NVHOST_RELOC_TYPE_DEFAULT); + + switch (fmt) { + /* 16-bit transfer emulation */ + case AV_PIX_FMT_RGB565: + /* Luma transfer */ + AV_NVTEGRA_PUSH_RELOC(cmdbuf, NVB0B6_VIDEO_COMPOSITOR_SET_SURFACE0_LUMA_OFFSET(0), + src_maps[0], src_map_offsets[0], src_reloc_type); + AV_NVTEGRA_PUSH_RELOC(cmdbuf, NVB0B6_VIDEO_COMPOSITOR_SET_OUTPUT_SURFACE_LUMA_OFFSET, + dst_maps[0], dst_map_offsets[0], dst_reloc_type); + break; + case AV_PIX_FMT_RGB32: + /* Chroma transfer */ + AV_NVTEGRA_PUSH_RELOC(cmdbuf, NVB0B6_VIDEO_COMPOSITOR_SET_SURFACE0_LUMA_OFFSET(0), + src_maps[1], src_map_offsets[1], src_reloc_type); + AV_NVTEGRA_PUSH_RELOC(cmdbuf, NVB0B6_VIDEO_COMPOSITOR_SET_OUTPUT_SURFACE_LUMA_OFFSET, + dst_maps[1], dst_map_offsets[1], dst_reloc_type); + break; + + /* Normal transfers */ + case AV_PIX_FMT_GRAY8: + case AV_PIX_FMT_NV12: + case AV_PIX_FMT_YUV420P: + for (i = 0; i < num_planes; ++i) { + AV_NVTEGRA_PUSH_RELOC(cmdbuf, NVB0B6_VIDEO_COMPOSITOR_SET_SURFACE0_LUMA_OFFSET(0) + i * sizeof(uint32_t), + src_maps[i], src_map_offsets[i], 
src_reloc_type); + AV_NVTEGRA_PUSH_RELOC(cmdbuf, NVB0B6_VIDEO_COMPOSITOR_SET_OUTPUT_SURFACE_LUMA_OFFSET + i * sizeof(uint32_t), + dst_maps[i], dst_map_offsets[i], dst_reloc_type); + } + break; + default: + return AVERROR(EINVAL); + } + + AV_NVTEGRA_PUSH_VALUE(cmdbuf, NVB0B6_VIDEO_COMPOSITOR_EXECUTE, + AV_NVTEGRA_ENUM(NVB0B6_VIDEO_COMPOSITOR_EXECUTE, AWAKEN, ENABLE)); + + err = av_nvtegra_cmdbuf_add_syncpt_incr(cmdbuf, pool->channel->syncpt, 0); + if (err < 0) + return err; + + err = av_nvtegra_cmdbuf_end(cmdbuf); + if (err < 0) + return err; + + return 0; +} + +static int nvtegra_vic_copy_plane(AVHWFramesContext *ctx, AVNVTegraJob *job, + const AVFrame *src, const AVFrame *dst, + enum AVPixelFormat fmt, AVNVTegraMap **plane_maps, uint32_t *plane_offsets, + int num_planes, bool is_chroma) +{ + NVTegraDevicePriv *priv = ctx->device_ctx->hwctx; + + uint8_t *mem; + int err; + + mem = av_nvtegra_map_get_addr(&job->input_map); + + nvtegra_vic_preprare_config((VicConfigStruct *)(mem + priv->vic_setup_off), + src, dst, fmt, is_chroma); + + err = av_nvtegra_cmdbuf_clear(&job->cmdbuf); + if (err < 0) + return err; + + err = nvtegra_vic_prepare_cmdbuf(ctx, &priv->job_pool, job, src, dst, fmt, + plane_maps, plane_offsets, num_planes); + if (err < 0) + goto fail; + + err = av_nvtegra_job_submit(&priv->job_pool, job); + if (err < 0) + goto fail; + + err = av_nvtegra_job_wait(&priv->job_pool, job, -1); + if (err < 0) + goto fail; + +fail: + return err; +} + +static int nvtegra_vic_transfer_data(AVHWFramesContext *ctx, const AVFrame *dst, const AVFrame *src, + int num_planes, bool from) +{ + NVTegraDevicePriv *priv = ctx->device_ctx->hwctx; + AVNVTegraDeviceContext *hwctx = &priv->p; + + AVBufferRef *job_ref; + AVNVTegraJob *job; + const AVFrame *swframe; + uint8_t *map_bases[4]; + AVNVTegraMap maps[4] = {0}; + AVNVTegraMap *plane_maps[4]; + uint32_t plane_offsets[4]; + /* num_maps starts at 0 because the fail path reads it even when no map was created */ + int num_maps = 0, i, j, err; + + swframe = from ? 
dst : src; + + job_ref = av_nvtegra_job_pool_get(&priv->job_pool); + if (!job_ref) { + err = AVERROR(ENOMEM); + goto fail; + } + + job = (AVNVTegraJob *)job_ref->data; + + /* Create a map for each frame backing buffer */ + for (i = 0; i < FF_ARRAY_ELEMS(maps); num_maps = ++i) { + if (!swframe->buf[i]) + break; + + /* + * In order to avoid a full-frame copy on the CPU, the provided memory + * is mapped into VIC and used directly during the transfer. + * The address and size are aligned to page boundaries. + * Cache management is performed manually to not affect data outside the buffer. + */ + map_bases[i] = (uint8_t *)((uintptr_t)swframe->buf[i]->data & ~0xfff); + err = av_nvtegra_map_from_va(&maps[i], &hwctx->vic_channel, map_bases[i], + swframe->buf[i]->size + ((uintptr_t)swframe->buf[i]->data & 0xfff), + 0x100, NVMAP_HANDLE_CACHEABLE); + if (err < 0) + goto fail; + + err = av_nvtegra_map_map(&maps[i]); + if (err < 0) + goto fail; + + /* Flush-invalidate the CPU cache prior to the transfer */ + av_nvtegra_map_cache_op(&maps[i], NVMAP_CACHE_OP_WB_INV, + ((uint8_t *)av_nvtegra_map_get_addr(&maps[i])) + + ((uintptr_t)swframe->buf[i]->data & 0xfff), + swframe->buf[i]->size); + } + + /* Find the corresponding map object and its offset for each plane */ + for (i = 0; i < num_planes; ++i) { + /* Only the first num_maps buffers are non-NULL and have a map/base recorded */ + for (j = 0; j < num_maps; ++j) { + if ((swframe->buf[j]->data <= swframe->data[i]) && + (swframe->data[i] < swframe->buf[j]->data + swframe->buf[j]->size)) + break; + } + + /* The plane is not backed by any mapped buffer: nothing valid to hand to VIC */ + if (j == num_maps) { + err = AVERROR(EINVAL); + goto fail; + } + + plane_maps [i] = &maps[j]; + plane_offsets[i] = swframe->data[i] - map_bases[j]; + } + + /* VIC expects planes in the reversed order */ + if (swframe->format == AV_PIX_FMT_YUV420P) { + FFSWAP(AVNVTegraMap *, plane_maps [1], plane_maps [2]); + FFSWAP(uint32_t, plane_offsets[1], plane_offsets[2]); + } + + /* + * VIC2 does not support 16-bit YUV surfaces (eg. P010, P012, ...). 
+ * Here we emulate them using two separate transfers for the luma and chroma planes + * (16-bit and 32-bit widths respectively). + */ + if (swframe->format == AV_PIX_FMT_P010) { + err = nvtegra_vic_copy_plane(ctx, job, src, dst, AV_PIX_FMT_RGB565, + plane_maps, plane_offsets, 1, false); + if (err < 0) + goto fail; + + err = nvtegra_vic_copy_plane(ctx, job, src, dst, AV_PIX_FMT_RGB32, + plane_maps, plane_offsets, 1, true); + if (err < 0) + goto fail; + } else { + err = nvtegra_vic_copy_plane(ctx, job, src, dst, swframe->format, + plane_maps, plane_offsets, num_planes, false); + if (err < 0) + goto fail; + } + +fail: + for (i = 0; i < num_maps; ++i) { + av_nvtegra_map_unmap(&maps[i]); + av_nvtegra_map_close(&maps[i]); + } + + av_buffer_unref(&job_ref); + + return err; +} + +static int nvtegra_transfer_data(AVHWFramesContext *ctx, AVFrame *dst, const AVFrame *src) { + const AVFrame *swframe; + bool from; + int num_planes, i; + + from = !dst->hw_frames_ctx; + swframe = from ? dst : src; + + if (swframe->hw_frames_ctx) + return AVERROR(ENOSYS); + + num_planes = av_pix_fmt_count_planes(swframe->format); + + for (i = 0; i < num_planes; ++i) { + if (((uintptr_t)swframe->data[i] & 0xff) || (swframe->linesize[i] & 0xff)) { + av_log(ctx, AV_LOG_WARNING, "Frame address/pitch not aligned to 256, " + "falling back to cpu transfer\n"); + return nvtegra_cpu_transfer_data(ctx, dst, src, num_planes, from); + } + } + + return nvtegra_vic_transfer_data(ctx, dst, src, num_planes, from); +} + +const HWContextType ff_hwcontext_type_nvtegra = { + .type = AV_HWDEVICE_TYPE_NVTEGRA, + .name = "nvtegra", + + .device_hwctx_size = sizeof(NVTegraDevicePriv), + .device_hwconfig_size = 0, + .frames_hwctx_size = 0, + + .device_create = &nvtegra_device_create, + .device_init = &nvtegra_device_init, + .device_uninit = &nvtegra_device_uninit, + + .frames_get_constraints = &nvtegra_frames_get_constraints, + .frames_init = &nvtegra_frames_init, + .frames_uninit = &nvtegra_frames_uninit, + 
.frames_get_buffer = &nvtegra_get_buffer, + + .transfer_get_formats = &nvtegra_transfer_get_formats, + .transfer_data_to = &nvtegra_transfer_data, + .transfer_data_from = &nvtegra_transfer_data, + + .pix_fmts = (const enum AVPixelFormat[]) { + AV_PIX_FMT_NVTEGRA, + AV_PIX_FMT_NONE, + }, +}; diff --git a/libavutil/hwcontext_nvtegra.h b/libavutil/hwcontext_nvtegra.h new file mode 100644 index 0000000000..8a2383d304 --- /dev/null +++ b/libavutil/hwcontext_nvtegra.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2024 averne <averne381@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef AVUTIL_HWCONTEXT_NVTEGRA_H +#define AVUTIL_HWCONTEXT_NVTEGRA_H + +#include <stdint.h> + +#include "hwcontext.h" +#include "buffer.h" +#include "frame.h" +#include "pixfmt.h" + +#include "nvtegra.h" + +/* + * Encode a hardware revision into a version number: + * major in bits 15..8, minor in bits 7..0. + * Arguments are parenthesized so that expression arguments are masked as a whole. + */ +#define AV_NVTEGRA_ENCODE_REV(maj, min) ((((maj) & 0xff) << 8) | ((min) & 0xff)) + +/* + * Decode a version number + */ +static inline void av_nvtegra_decode_rev(int rev, int *maj, int *min) { + *maj = (rev >> 8) & 0xff; + *min = (rev >> 0) & 0xff; +} + +/** + * @file + * API-specific header for AV_HWDEVICE_TYPE_NVTEGRA. 
+ * + * For user-allocated pools, AVHWFramesContext.pool must return AVBufferRefs + * with the data pointer set to an AVNVTegraMap. + */ + +typedef struct AVNVTegraDeviceContext { + /* + * Hardware multimedia engines + */ + AVNVTegraChannel nvdec_channel, nvenc_channel, nvjpg_channel, vic_channel; + + /* + * Hardware revisions for associated engines, or 0 if invalid + */ + int nvdec_version, nvenc_version, nvjpg_version, vic_version; +} AVNVTegraDeviceContext; + +typedef struct AVNVTegraFrame { + /* + * Reference to an AVNVTegraMap object + */ + AVBufferRef *map_ref; +} AVNVTegraFrame; + +/* + * Helper to retrieve a map object from the corresponding frame + */ +static inline AVNVTegraMap *av_nvtegra_frame_get_fbuf_map(const AVFrame *frame) { + return (AVNVTegraMap *)((AVNVTegraFrame *)frame->buf[0]->data)->map_ref->data; +} + +/* + * Converts a pixel format to the equivalent code for the VIC engine + */ +int av_nvtegra_pixfmt_to_vic(enum AVPixelFormat fmt); + +#endif /* AVUTIL_HWCONTEXT_NVTEGRA_H */

[FFmpeg-devel,06/16] avutil: add nvtegra hwcontext

Checks

Commit Message

Comments

Patch