diff mbox series

[FFmpeg-devel,05/16] avutil: add common code for nvtegra

Message ID eed9e71f25cf254d00fd9532781c318016580001.1717083799.git.averne381@gmail.com
State New
Headers show
Series NVidia Tegra hardware decoding backend | expand

Checks

Context Check Description
andriy/commit_msg_x86 warning Please wrap lines in the body of the commit message between 60 and 72 characters.
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

averne May 30, 2024, 7:43 p.m. UTC
This includes a new pixel format for nvtegra hardware frames, and several objects for interaction with hardware blocks.
In particular, this contains code for channels (handles to hardware engines), maps (memory-mapped buffers shared with engines), and command buffers (abstraction for building command lists sent to the engines).

Signed-off-by: averne <averne381@gmail.com>
---
 configure                  |    2 +
 libavutil/Makefile         |    4 +
 libavutil/nvtegra.c        | 1035 ++++++++++++++++++++++++++++++++++++
 libavutil/nvtegra.h        |  258 +++++++++
 libavutil/nvtegra_host1x.h |   94 ++++
 libavutil/pixdesc.c        |    4 +
 libavutil/pixfmt.h         |    8 +
 7 files changed, 1405 insertions(+)
 create mode 100644 libavutil/nvtegra.c
 create mode 100644 libavutil/nvtegra.h
 create mode 100644 libavutil/nvtegra_host1x.h

Comments

Rémi Denis-Courmont May 31, 2024, 8:32 a.m. UTC | #1
Le 30 mai 2024 22:43:07 GMT+03:00, averne <averne381@gmail.com> a écrit :
>This includes a new pixel format for nvtegra hardware frames, and several objects for interaction with hardware blocks.
>In particular, this contains code for channels (handles to hardware engines), maps (memory-mapped buffers shared with engines), and command buffers (abstraction for building command lists sent to the engines).
>
>Signed-off-by: averne <averne381@gmail.com>
>---
> configure                  |    2 +
> libavutil/Makefile         |    4 +
> libavutil/nvtegra.c        | 1035 ++++++++++++++++++++++++++++++++++++
> libavutil/nvtegra.h        |  258 +++++++++
> libavutil/nvtegra_host1x.h |   94 ++++
> libavutil/pixdesc.c        |    4 +
> libavutil/pixfmt.h         |    8 +
> 7 files changed, 1405 insertions(+)
> create mode 100644 libavutil/nvtegra.c
> create mode 100644 libavutil/nvtegra.h
> create mode 100644 libavutil/nvtegra_host1x.h
>
>diff --git a/configure b/configure
>index 09fb2aed1b..51f169bfbd 100755
>--- a/configure
>+++ b/configure
>@@ -361,6 +361,7 @@ External library support:
>   --disable-vdpau          disable Nvidia Video Decode and Presentation API for Unix code [autodetect]
>   --disable-videotoolbox   disable VideoToolbox code [autodetect]
>   --disable-vulkan         disable Vulkan code [autodetect]
>+  --enable-nvtegra         enable nvtegra code [no]
> 
> Toolchain options:
>   --arch=ARCH              select architecture [$arch]
>@@ -3151,6 +3152,7 @@ videotoolbox_hwaccel_deps="videotoolbox pthreads"
> videotoolbox_hwaccel_extralibs="-framework QuartzCore"
> vulkan_deps="threads"
> vulkan_deps_any="libdl LoadLibrary"
>+nvtegra_deps="gpl"
> 
> av1_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_AV1"
> av1_d3d11va_hwaccel_select="av1_decoder"
>diff --git a/libavutil/Makefile b/libavutil/Makefile
>index 9c112bc58a..733a23a8a3 100644
>--- a/libavutil/Makefile
>+++ b/libavutil/Makefile
>@@ -52,6 +52,7 @@ HEADERS = adler32.h                                                     \
>           hwcontext_videotoolbox.h                                      \
>           hwcontext_vdpau.h                                             \
>           hwcontext_vulkan.h                                            \
>+          nvtegra.h                                                     \
>           nvhost_ioctl.h                                                \
>           nvmap_ioctl.h                                                 \
>           iamf.h                                                        \
>@@ -209,6 +210,7 @@ OBJS-$(CONFIG_VDPAU)                    += hwcontext_vdpau.o
> OBJS-$(CONFIG_VULKAN)                   += hwcontext_vulkan.o vulkan.o
> 
> OBJS-$(!CONFIG_VULKAN)                  += hwcontext_stub.o
>+OBJS-$(CONFIG_NVTEGRA)                  += nvtegra.o
> 
> OBJS += $(COMPAT_OBJS:%=../compat/%)
> 
>@@ -230,6 +232,8 @@ SKIPHEADERS-$(CONFIG_VDPAU)            += hwcontext_vdpau.h
> SKIPHEADERS-$(CONFIG_VULKAN)           += hwcontext_vulkan.h vulkan.h   \
>                                           vulkan_functions.h            \
>                                           vulkan_loader.h
>+SKIPHEADERS-$(CONFIG_NVTEGRA)          += nvtegra.h                     \
>+                                          nvtegra_host1x.h
> 
> TESTPROGS = adler32                                                     \
>             aes                                                         \
>diff --git a/libavutil/nvtegra.c b/libavutil/nvtegra.c
>new file mode 100644
>index 0000000000..ad0bbbdfaa
>--- /dev/null
>+++ b/libavutil/nvtegra.c
>@@ -0,0 +1,1035 @@
>+/*
>+ * Copyright (c) 2024 averne <averne381@gmail.com>
>+ *
>+ * This file is part of FFmpeg.
>+ *
>+ * FFmpeg is free software; you can redistribute it and/or modify
>+ * it under the terms of the GNU General Public License as published by
>+ * the Free Software Foundation; either version 2 of the License, or
>+ * (at your option) any later version.
>+ *
>+ * FFmpeg is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>+ * GNU General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU General Public License along
>+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
>+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
>+ */
>+
>+#ifndef __SWITCH__
>+#   include <sys/ioctl.h>
>+#   include <sys/mman.h>
>+#   include <fcntl.h>
>+#   include <unistd.h>
>+#else
>+#   include <stdlib.h>
>+#   include <switch.h>
>+#endif
>+
>+#include <string.h>
>+
>+#include "buffer.h"
>+#include "log.h"
>+#include "error.h"
>+#include "mem.h"
>+#include "thread.h"
>+
>+#include "nvhost_ioctl.h"
>+#include "nvmap_ioctl.h"
>+#include "nvtegra_host1x.h"
>+
>+#include "nvtegra.h"
>+
>+/*
>+ * Tag used by the kernel to identify allocations.
>+ * Official software has been seen using 0x900, 0xf00, 0x1100, 0x1400, 0x4000.
>+ */
>+#define MEM_TAG (0xfeed)
>+
>+struct DriverState {
>+    int nvmap_fd, nvhost_fd;
>+};
>+
>+static AVMutex g_driver_init_mtx = AV_MUTEX_INITIALIZER;
>+static struct DriverState *g_driver_state = NULL;
>+static AVBufferRef *g_driver_state_ref = NULL;
>+
>+static void free_driver_fds(void *opaque, uint8_t *data) {
>+    if (!g_driver_state)
>+        return;
>+
>+#ifndef __SWITCH__
>+    if (g_driver_state->nvmap_fd > 0)
>+        close(g_driver_state->nvmap_fd);
>+
>+    if (g_driver_state->nvhost_fd > 0)
>+        close(g_driver_state->nvhost_fd);
>+#else
>+    nvFenceExit();
>+    nvMapExit();
>+    nvExit();
>+    mmuExit();
>+#endif
>+
>+    g_driver_init_mtx  = (AVMutex)AV_MUTEX_INITIALIZER;
>+    g_driver_state_ref = NULL;
>+    av_freep(&g_driver_state);
>+}
>+
>+static int init_driver_fds(void) {
>+    AVBufferRef *ref;
>+    struct DriverState *state;
>+    int err;
>+
>+    state = av_mallocz(sizeof(*state));
>+    if (!state)
>+        return AVERROR(ENOMEM);
>+
>+    ref = av_buffer_create((uint8_t *)state, sizeof(*state), free_driver_fds, NULL, 0);
>+    if (!state)
>+        return AVERROR(ENOMEM);
>+
>+    g_driver_state     = state;
>+    g_driver_state_ref = ref;
>+
>+#ifndef __SWITCH__
>+    err = open("/dev/nvmap", O_RDWR | O_SYNC);

There are helpers for opening files, and you're missing the close-on-exec flag here. It's also not clear why you need O_SYNC.

But did you consider just reimplementing libnvdec instead of putting the device driver directly in FFmpeg?

>+    if (err < 0)
>+        return AVERROR(errno);
>+    state->nvmap_fd = err;
>+
>+    err = open("/dev/nvhost-ctrl", O_RDWR | O_SYNC);
>+    if (err < 0)
>+        return AVERROR(errno);
>+    state->nvhost_fd = err;
>+#else
>+    err = nvInitialize();
>+    if (R_FAILED(err))
>+        return AVERROR(err);
>+
>+    err = nvMapInit();
>+    if (R_FAILED(err))
>+        return AVERROR(err);
>+    state->nvmap_fd = nvMapGetFd();
>+
>+    err = nvFenceInit();
>+    if (R_FAILED(err))
>+        return AVERROR(err);
>+    /* libnx doesn't export the nvhost-ctrl file descriptor */
>+
>+    err = mmuInitialize();
>+    if (R_FAILED(err))
>+        return AVERROR(err);
>+#endif
>+
>+    return 0;
>+}
>+
>+static inline int get_nvmap_fd(void) {
>+    if (!g_driver_state)
>+        return AVERROR_UNKNOWN;
>+
>+    if (!g_driver_state->nvmap_fd)
>+        return AVERROR_UNKNOWN;
>+
>+    return g_driver_state->nvmap_fd;
>+}
>+
>+static inline int get_nvhost_fd(void) {
>+    if (!g_driver_state)
>+        return AVERROR_UNKNOWN;
>+
>+    if (!g_driver_state->nvhost_fd)
>+        return AVERROR_UNKNOWN;
>+
>+    return g_driver_state->nvhost_fd;
>+}
>+
>+AVBufferRef *av_nvtegra_driver_init(void) {
>+    AVBufferRef *out = NULL;
>+    int err;
>+
>+    /*
>+     * We have to do this overly complex dance of putting driver fds in a refcounted struct,
>+     * otherwise initializing multiple hwcontexts would leak fds
>+     */
>+
>+    err = ff_mutex_lock(&g_driver_init_mtx);
>+    if (err != 0)
>+        goto exit;
>+
>+    if (g_driver_state_ref) {
>+        out = av_buffer_ref(g_driver_state_ref);
>+        goto exit;
>+    }
>+
>+    err = init_driver_fds();
>+    if (err < 0) {
>+        /* In case memory allocations failed, call the destructor ourselves */
>+        av_buffer_unref(&g_driver_state_ref);
>+        free_driver_fds(NULL, NULL);
>+        goto exit;
>+    }
>+
>+    out = g_driver_state_ref;
>+
>+exit:
>+    ff_mutex_unlock(&g_driver_init_mtx);
>+    return out;
>+}
>+
>+int av_nvtegra_channel_open(AVNVTegraChannel *channel, const char *dev) {
>+    int err;
>+#ifndef __SWITCH__
>+    struct nvhost_get_param_arg args;
>+
>+    err = open(dev, O_RDWR);
>+    if (err < 0)
>+        return AVERROR(errno);
>+
>+    channel->fd = err;
>+
>+    args = (struct nvhost_get_param_arg){0};
>+
>+    err = ioctl(channel->fd, NVHOST_IOCTL_CHANNEL_GET_SYNCPOINT, &args);
>+    if (err < 0)
>+        goto fail;
>+
>+    channel->syncpt = args.value;
>+
>+    return 0;
>+
>+fail:
>+    close(channel->fd);
>+    return AVERROR(errno);
>+#else
>+    err = nvChannelCreate(&channel->channel, dev);
>+    if (R_FAILED(err))
>+        return AVERROR(err);
>+
>+    err = nvioctlChannel_GetSyncpt(channel->channel.fd, 0, &channel->syncpt);
>+    if (R_FAILED(err))
>+        goto fail;
>+
>+    return 0;
>+
>+fail:
>+    nvChannelClose(&channel->channel);
>+    return AVERROR(err);
>+#endif
>+}
>+
>+int av_nvtegra_channel_close(AVNVTegraChannel *channel) {
>+#ifndef __SWITCH__
>+    if (!channel->fd)
>+        return 0;
>+
>+    return close(channel->fd);
>+#else
>+    nvChannelClose(&channel->channel);
>+    return 0;
>+#endif
>+}
>+
>+int av_nvtegra_channel_get_clock_rate(AVNVTegraChannel *channel, uint32_t moduleid, uint32_t *clock_rate) {
>+    int err;
>+#ifndef __SWITCH__
>+    struct nvhost_clk_rate_args args;
>+
>+    args = (struct nvhost_clk_rate_args){
>+        .moduleid = moduleid,
>+    };
>+
>+    err = ioctl(channel->fd, NVHOST_IOCTL_CHANNEL_GET_CLK_RATE, &args);
>+    if (err < 0)
>+        return AVERROR(errno);
>+
>+    if (clock_rate)
>+        *clock_rate = args.rate;
>+
>+    return 0;
>+#else
>+    uint32_t tmp;
>+
>+    err = AVERROR(nvioctlChannel_GetModuleClockRate(channel->channel.fd, moduleid, &tmp));
>+    if (err < 0)
>+        return err;
>+
>+    if (clock_rate)
>+        *clock_rate = tmp * 1000;
>+
>+    return 0;
>+#endif
>+}
>+
>+int av_nvtegra_channel_set_clock_rate(AVNVTegraChannel *channel, uint32_t moduleid, uint32_t clock_rate) {
>+#ifndef __SWITCH__
>+    struct nvhost_clk_rate_args args;
>+
>+    args = (struct nvhost_clk_rate_args){
>+        .rate     = clock_rate,
>+        .moduleid = moduleid,
>+    };
>+
>+    return (ioctl(channel->fd, NVHOST_IOCTL_CHANNEL_SET_CLK_RATE, &args) < 0) ? AVERROR(errno) : 0;
>+#else
>+    return AVERROR(nvioctlChannel_SetModuleClockRate(channel->channel.fd, moduleid, clock_rate / 1000));
>+#endif
>+}
>+
>+int av_nvtegra_channel_submit(AVNVTegraChannel *channel, AVNVTegraCmdbuf *cmdbuf, uint32_t *fence) {
>+    int err;
>+#ifndef __SWITCH__
>+    struct nvhost_submit_args args;
>+
>+    args = (struct nvhost_submit_args){
>+        .submit_version          = NVHOST_SUBMIT_VERSION_V2,
>+        .num_syncpt_incrs        = cmdbuf->num_syncpt_incrs,
>+        .num_cmdbufs             = cmdbuf->num_cmdbufs,
>+        .num_relocs              = cmdbuf->num_relocs,
>+        .num_waitchks            = cmdbuf->num_waitchks,
>+        .timeout                 = 0,
>+        .flags                   = 0,
>+        .fence                   = 0,
>+        .syncpt_incrs            = (uintptr_t)cmdbuf->syncpt_incrs,
>+        .cmdbuf_exts             = (uintptr_t)cmdbuf->cmdbuf_exts,
>+        .checksum_methods        = 0,
>+        .checksum_falcon_methods = 0,
>+        .pad                     = { 0 },
>+        .reloc_types             = (uintptr_t)cmdbuf->reloc_types,
>+        .cmdbufs                 = (uintptr_t)cmdbuf->cmdbufs,
>+        .relocs                  = (uintptr_t)cmdbuf->relocs,
>+        .reloc_shifts            = (uintptr_t)cmdbuf->reloc_shifts,
>+        .waitchks                = (uintptr_t)cmdbuf->waitchks,
>+        .waitbases               = 0,
>+        .class_ids               = (uintptr_t)cmdbuf->class_ids,
>+        .fences                  = (uintptr_t)cmdbuf->fences,
>+    };
>+
>+    err = ioctl(channel->fd, NVHOST_IOCTL_CHANNEL_SUBMIT, &args);
>+    if (err < 0)
>+        return AVERROR(errno);
>+
>+    if (fence)
>+        *fence = args.fence;
>+
>+    return 0;
>+#else
>+    nvioctl_fence tmp;
>+
>+    err = nvioctlChannel_Submit(channel->channel.fd, (nvioctl_cmdbuf *)cmdbuf->cmdbufs, cmdbuf->num_cmdbufs,
>+                                NULL, NULL, 0, (nvioctl_syncpt_incr *)cmdbuf->syncpt_incrs, cmdbuf->num_syncpt_incrs,
>+                                &tmp, 1);
>+    if (R_FAILED(err))
>+        return AVERROR(err);
>+
>+    if (fence)
>+        *fence = tmp.value;
>+
>+    return 0;
>+#endif
>+}
>+
>+int av_nvtegra_channel_set_submit_timeout(AVNVTegraChannel *channel, uint32_t timeout_ms) {
>+#ifndef __SWITCH__
>+    struct nvhost_set_timeout_args args;
>+
>+    args = (struct nvhost_set_timeout_args){
>+        .timeout = timeout_ms,
>+    };
>+
>+    return (ioctl(channel->fd, NVHOST_IOCTL_CHANNEL_SET_TIMEOUT, &args) < 0) ? AVERROR(errno) : 0;
>+#else
>+    return AVERROR(nvioctlChannel_SetSubmitTimeout(channel->channel.fd, timeout_ms));
>+#endif
>+}
>+
>+int av_nvtegra_syncpt_wait(AVNVTegraChannel *channel, uint32_t threshold, int32_t timeout) {
>+#ifndef __SWITCH__
>+    struct nvhost_ctrl_syncpt_waitex_args args = {
>+        .id      = channel->syncpt,
>+        .thresh  = threshold,
>+        .timeout = timeout,
>+    };
>+
>+    return (ioctl(get_nvhost_fd(), NVHOST_IOCTL_CTRL_SYNCPT_WAITEX, &args) < 0) ? AVERROR(errno) : 0;
>+#else
>+    NvFence fence;
>+
>+    fence = (NvFence){
>+        .id    = channel->syncpt,
>+        .value = threshold,
>+    };
>+
>+    return AVERROR(nvFenceWait(&fence, timeout));
>+#endif
>+}
>+
>+#ifdef __SWITCH__
>+static inline bool convert_cache_flags(uint32_t flags) {
>+    /* Return whether the map should be CPU-cacheable */
>+    switch (flags & NVMAP_HANDLE_CACHE_FLAG) {
>+        case NVMAP_HANDLE_INNER_CACHEABLE:
>+        case NVMAP_HANDLE_CACHEABLE:
>+            return true;
>+        default:
>+            return false;
>+    }
>+}
>+#endif
>+
>+int av_nvtegra_map_allocate(AVNVTegraMap *map, AVNVTegraChannel *channel, uint32_t size,
>+                            uint32_t align, int heap_mask, int flags)
>+{
>+#ifndef __SWITCH__
>+    struct nvmap_create_handle create_args;
>+    struct nvmap_alloc_handle alloc_args;
>+    int err;
>+
>+    create_args = (struct nvmap_create_handle){
>+        .size   = size,
>+    };
>+
>+    err = ioctl(get_nvmap_fd(), NVMAP_IOC_CREATE, &create_args);
>+    if (err < 0)
>+        return AVERROR(errno);
>+
>+    map->size   = size;
>+    map->handle = create_args.handle;
>+
>+    alloc_args = (struct nvmap_alloc_handle){
>+        .handle    = create_args.handle,
>+        .heap_mask = heap_mask,
>+        .flags     = flags | (MEM_TAG << 16),
>+        .align     = align,
>+    };
>+
>+    err = ioctl(get_nvmap_fd(), NVMAP_IOC_ALLOC, &alloc_args);
>+    if (err < 0)
>+        goto fail;
>+
>+    return 0;
>+
>+fail:
>+    av_nvtegra_map_free(map);
>+    return AVERROR(errno);
>+#else
>+    void *mem;
>+
>+    map->owner = channel->channel.fd;
>+
>+    size = FFALIGN(size, 0x1000);
>+
>+    mem = aligned_alloc(FFALIGN(align, 0x1000), size);
>+    if (!mem)
>+        return AVERROR(ENOMEM);
>+
>+    return AVERROR(nvMapCreate(&map->map, mem, size, 0x10000, NvKind_Pitch,
>+                               convert_cache_flags(flags)));
>+#endif
>+}
>+
>+int av_nvtegra_map_free(AVNVTegraMap *map) {
>+#ifndef __SWITCH__
>+    int err;
>+
>+    if (!map->handle)
>+        return 0;
>+
>+    err = ioctl(get_nvmap_fd(), NVMAP_IOC_FREE, map->handle);
>+    if (err < 0)
>+        return AVERROR(errno);
>+
>+    map->handle = 0;
>+
>+    return 0;
>+#else
>+    void *addr = map->map.cpu_addr;
>+
>+    if (!map->map.cpu_addr)
>+        return 0;
>+
>+    nvMapClose(&map->map);
>+    free(addr);
>+    return 0;
>+#endif
>+}
>+
>+int av_nvtegra_map_from_va(AVNVTegraMap *map, AVNVTegraChannel *owner, void *mem,
>+                           uint32_t size, uint32_t align, uint32_t flags)
>+{
>+#ifndef __SWITCH__
>+    struct nvmap_create_handle_from_va args;
>+    int err;
>+
>+    args = (struct nvmap_create_handle_from_va){
>+        .va    = (uintptr_t)mem,
>+        .size  = size,
>+        .flags = flags | (MEM_TAG << 16),
>+    };
>+
>+    err = ioctl(get_nvmap_fd(), NVMAP_IOC_FROM_VA, &args);
>+    if (err < 0)
>+        return AVERROR(errno);
>+
>+    map->cpu_addr = mem;
>+    map->size     = size;
>+    map->handle   = args.handle;
>+
>+    return 0;
>+#else
>+
>+    map->owner = owner->channel.fd;
>+
>+    return AVERROR(nvMapCreate(&map->map, mem, FFALIGN(size, 0x1000), 0x10000, NvKind_Pitch,
>+                               convert_cache_flags(flags)));;
>+#endif
>+}
>+
>+int av_nvtegra_map_close(AVNVTegraMap *map) {
>+#ifndef __SWITCH__
>+    return av_nvtegra_map_free(map);
>+#else
>+    nvMapClose(&map->map);
>+    return 0;
>+#endif
>+}
>+
>+int av_nvtegra_map_map(AVNVTegraMap *map) {
>+#ifndef __SWITCH__
>+    void *addr;
>+
>+    addr = mmap(NULL, map->size, PROT_READ | PROT_WRITE, MAP_SHARED, map->handle, 0);
>+    if (addr == MAP_FAILED)
>+        return AVERROR(errno);
>+
>+    map->cpu_addr = addr;
>+
>+    return 0;
>+#else
>+    nvioctl_command_buffer_map params;
>+    int err;
>+
>+    params = (nvioctl_command_buffer_map){
>+        .handle = map->map.handle,
>+    };
>+
>+    err = nvioctlChannel_MapCommandBuffer(map->owner, &params, 1, false);
>+    if (R_FAILED(err))
>+        return AVERROR(err);
>+
>+    map->iova = params.iova;
>+
>+    return 0;
>+#endif
>+}
>+
>+int av_nvtegra_map_unmap(AVNVTegraMap *map) {
>+    int err;
>+#ifndef __SWITCH__
>+    if (!map->cpu_addr)
>+        return 0;
>+
>+    err = munmap(map->cpu_addr, map->size);
>+    if (err < 0)
>+        return AVERROR(errno);
>+
>+    map->cpu_addr = NULL;
>+
>+    return 0;
>+#else
>+    nvioctl_command_buffer_map params;
>+
>+    if (!map->iova)
>+        return 0;
>+
>+    params = (nvioctl_command_buffer_map){
>+        .handle = map->map.handle,
>+        .iova   = map->iova,
>+    };
>+
>+    err = nvioctlChannel_UnmapCommandBuffer(map->owner, &params, 1, false);
>+    if (R_FAILED(err))
>+        return AVERROR(err);
>+
>+    map->iova = 0;
>+
>+    return 0;
>+#endif
>+}
>+
>+int av_nvtegra_map_cache_op(AVNVTegraMap *map, int op, void *addr, size_t len) {
>+#ifndef __SWITCH__
>+    struct nvmap_cache_op args;
>+
>+    args = (struct nvmap_cache_op){
>+        .addr   = (uintptr_t)addr,
>+        .len    = len,
>+        .handle = av_nvtegra_map_get_handle(map),
>+        .op     = op,
>+    };
>+
>+    return AVERROR(ioctl(get_nvmap_fd(), NVMAP_IOC_CACHE, &args));
>+#else
>+    if (!map->map.is_cpu_cacheable)
>+        return 0;
>+
>+    switch (op) {
>+        case NVMAP_CACHE_OP_WB:
>+            armDCacheClean(addr, len);
>+            break;
>+        default:
>+        case NVMAP_CACHE_OP_INV:
>+        case NVMAP_CACHE_OP_WB_INV:
>+            /* libnx internally performs a clean-invalidate, since invalidate is a privileged instruction */
>+            armDCacheFlush(addr, len);
>+            break;
>+    }
>+
>+    return 0;
>+#endif
>+}
>+
>+int av_nvtegra_map_realloc(AVNVTegraMap *map, uint32_t size, uint32_t align,
>+                           int heap_mask, int flags)
>+{
>+    AVNVTegraChannel channel;
>+    AVNVTegraMap tmp = {0};
>+    int err;
>+
>+    if (av_nvtegra_map_get_size(map) >= size)
>+        return 0;
>+
>+    /* Dummy channel object to hold the owner fd */
>+    channel = (AVNVTegraChannel){
>+#ifdef __SWITCH__
>+        .channel.fd = map->owner,
>+#endif
>+    };
>+
>+    err = av_nvtegra_map_create(&tmp, &channel, size, align, heap_mask, flags);
>+    if (err < 0)
>+        goto fail;
>+
>+    memcpy(av_nvtegra_map_get_addr(&tmp), av_nvtegra_map_get_addr(map), av_nvtegra_map_get_size(map));
>+
>+    err = av_nvtegra_map_destroy(map);
>+    if (err < 0)
>+        goto fail;
>+
>+    *map = tmp;
>+
>+    return 0;
>+
>+fail:
>+    av_nvtegra_map_destroy(&tmp);
>+    return err;
>+}
>+
>+int av_nvtegra_cmdbuf_init(AVNVTegraCmdbuf *cmdbuf) {
>+    cmdbuf->num_cmdbufs      = 0;
>+#ifndef __SWITCH__
>+    cmdbuf->num_relocs       = 0;
>+    cmdbuf->num_waitchks     = 0;
>+#endif
>+    cmdbuf->num_syncpt_incrs = 0;
>+
>+#define NUM_INITIAL_CMDBUFS      3
>+#define NUM_INITIAL_RELOCS       15
>+#define NUM_INITIAL_SYNCPT_INCRS 3
>+
>+    cmdbuf->cmdbufs      = av_malloc_array(NUM_INITIAL_CMDBUFS, sizeof(*cmdbuf->cmdbufs));
>+#ifndef __SWITCH__
>+    cmdbuf->cmdbuf_exts  = av_malloc_array(NUM_INITIAL_CMDBUFS, sizeof(*cmdbuf->cmdbuf_exts));
>+    cmdbuf->class_ids    = av_malloc_array(NUM_INITIAL_CMDBUFS, sizeof(*cmdbuf->class_ids));
>+#endif
>+
>+#ifndef __SWITCH__
>+    if (!cmdbuf->cmdbufs || !cmdbuf->cmdbuf_exts || !cmdbuf->class_ids)
>+#else
>+    if (!cmdbuf->cmdbufs)
>+#endif
>+        return AVERROR(ENOMEM);
>+
>+#ifndef __SWITCH__
>+    cmdbuf->relocs       = av_malloc_array(NUM_INITIAL_RELOCS, sizeof(*cmdbuf->relocs));
>+    cmdbuf->reloc_types  = av_malloc_array(NUM_INITIAL_RELOCS, sizeof(*cmdbuf->reloc_types));
>+    cmdbuf->reloc_shifts = av_malloc_array(NUM_INITIAL_RELOCS, sizeof(*cmdbuf->reloc_shifts));
>+    if (!cmdbuf->relocs || !cmdbuf->reloc_types || !cmdbuf->reloc_shifts)
>+        return AVERROR(ENOMEM);
>+#endif
>+
>+    cmdbuf->syncpt_incrs = av_malloc_array(NUM_INITIAL_SYNCPT_INCRS, sizeof(*cmdbuf->syncpt_incrs));
>+#ifndef __SWITCH__
>+    cmdbuf->fences       = av_malloc_array(NUM_INITIAL_SYNCPT_INCRS, sizeof(*cmdbuf->fences));
>+#endif
>+
>+#ifndef __SWITCH__
>+    if (!cmdbuf->syncpt_incrs || !cmdbuf->fences)
>+#else
>+    if (!cmdbuf->syncpt_incrs)
>+#endif
>+        return AVERROR(ENOMEM);
>+
>+    return 0;
>+}
>+
>+int av_nvtegra_cmdbuf_deinit(AVNVTegraCmdbuf *cmdbuf) {
>+    av_freep(&cmdbuf->cmdbufs);
>+    av_freep(&cmdbuf->syncpt_incrs);
>+
>+#ifndef __SWITCH__
>+    av_freep(&cmdbuf->cmdbuf_exts), av_freep(&cmdbuf->class_ids);
>+    av_freep(&cmdbuf->relocs), av_freep(&cmdbuf->reloc_types), av_freep(&cmdbuf->reloc_shifts);
>+    av_freep(&cmdbuf->fences);
>+#endif
>+
>+    return 0;
>+}
>+
>+int av_nvtegra_cmdbuf_add_memory(AVNVTegraCmdbuf *cmdbuf, AVNVTegraMap *map, uint32_t offset, uint32_t size) {
>+    uint8_t *mem;
>+
>+    mem = av_nvtegra_map_get_addr(map);
>+
>+    cmdbuf->map        = map;
>+    cmdbuf->mem_offset = offset;
>+    cmdbuf->mem_size   = size;
>+
>+    cmdbuf->cur_word = (uint32_t *)(mem + cmdbuf->mem_offset);
>+
>+    return 0;
>+}
>+
>+int av_nvtegra_cmdbuf_clear(AVNVTegraCmdbuf *cmdbuf) {
>+    uint8_t *mem;
>+
>+    mem = av_nvtegra_map_get_addr(cmdbuf->map);
>+
>+    cmdbuf->num_cmdbufs = 0, cmdbuf->num_syncpt_incrs = 0;
>+#ifndef __SWITCH__
>+    cmdbuf->num_relocs = 0, cmdbuf->num_waitchks = 0;
>+#endif
>+
>+    cmdbuf->cur_word = (uint32_t *)(mem + cmdbuf->mem_offset);
>+    return 0;
>+}
>+
>+int av_nvtegra_cmdbuf_begin(AVNVTegraCmdbuf *cmdbuf, uint32_t class_id) {
>+    uint8_t *mem;
>+    void *tmp1;
>+#ifndef __SWITCH__
>+    void *tmp2, *tmp3;
>+#endif
>+
>+    mem = av_nvtegra_map_get_addr(cmdbuf->map);
>+
>+    tmp1 = av_realloc_array(cmdbuf->cmdbufs,     cmdbuf->num_cmdbufs + 1, sizeof(*cmdbuf->cmdbufs));
>+#ifndef __SWITCH__
>+    tmp2 = av_realloc_array(cmdbuf->cmdbuf_exts, cmdbuf->num_cmdbufs + 1, sizeof(*cmdbuf->cmdbuf_exts));
>+    tmp3 = av_realloc_array(cmdbuf->class_ids,   cmdbuf->num_cmdbufs + 1, sizeof(*cmdbuf->class_ids));
>+#endif
>+
>+#ifndef __SWITCH__
>+    if (!tmp1 || !tmp2 || !tmp3)
>+#else
>+    if (!tmp1)
>+#endif
>+        return AVERROR(ENOMEM);
>+
>+    cmdbuf->cmdbufs = tmp1;
>+
>+#ifndef __SWITCH__
>+    cmdbuf->cmdbuf_exts = tmp2, cmdbuf->class_ids = tmp3;
>+#endif
>+
>+    cmdbuf->cmdbufs[cmdbuf->num_cmdbufs] = (struct nvhost_cmdbuf){
>+        .mem       = av_nvtegra_map_get_handle(cmdbuf->map),
>+        .offset    = (uint8_t *)cmdbuf->cur_word - mem,
>+    };
>+
>+#ifndef __SWITCH__
>+    cmdbuf->cmdbuf_exts[cmdbuf->num_cmdbufs] = (struct nvhost_cmdbuf_ext){
>+        .pre_fence = -1,
>+    };
>+
>+    cmdbuf->class_ids[cmdbuf->num_cmdbufs] = class_id;
>+#endif
>+
>+#ifdef __SWITCH__
>+    if (cmdbuf->num_cmdbufs == 0)
>+        av_nvtegra_cmdbuf_push_word(cmdbuf, host1x_opcode_setclass(class_id, 0, 0));
>+#endif
>+
>+    return 0;
>+}
>+
>+int av_nvtegra_cmdbuf_end(AVNVTegraCmdbuf *cmdbuf) {
>+    cmdbuf->num_cmdbufs++;
>+    return 0;
>+}
>+
>+int av_nvtegra_cmdbuf_push_word(AVNVTegraCmdbuf *cmdbuf, uint32_t word) {
>+    uintptr_t mem_start = (uintptr_t)av_nvtegra_map_get_addr(cmdbuf->map) + cmdbuf->mem_offset;
>+
>+    if ((uintptr_t)cmdbuf->cur_word - mem_start >= cmdbuf->mem_size)
>+        return AVERROR(ENOMEM);
>+
>+    *cmdbuf->cur_word++ = word;
>+    cmdbuf->cmdbufs[cmdbuf->num_cmdbufs].words += 1;
>+    return 0;
>+}
>+
>+int av_nvtegra_cmdbuf_push_value(AVNVTegraCmdbuf *cmdbuf, uint32_t offset, uint32_t word) {
>+    int err;
>+
>+    err = av_nvtegra_cmdbuf_push_word(cmdbuf, host1x_opcode_incr(NV_THI_METHOD0>>2, 2));
>+    if (err < 0)
>+        return err;
>+
>+    err = av_nvtegra_cmdbuf_push_word(cmdbuf, offset);
>+    if (err < 0)
>+        return err;
>+
>+    err = av_nvtegra_cmdbuf_push_word(cmdbuf, word);
>+    if (err < 0)
>+        return err;
>+
>+    return 0;
>+}
>+
>+int av_nvtegra_cmdbuf_push_reloc(AVNVTegraCmdbuf *cmdbuf, uint32_t offset, AVNVTegraMap *target, uint32_t target_offset,
>+                                 int reloc_type, int shift)
>+{
>+    int err;
>+#ifndef __SWITCH__
>+    uint8_t *mem;
>+    void *tmp1, *tmp2, *tmp3;
>+
>+    mem = av_nvtegra_map_get_addr(cmdbuf->map);
>+
>+    tmp1 = av_realloc_array(cmdbuf->relocs,       cmdbuf->num_relocs + 1, sizeof(*cmdbuf->relocs));
>+    tmp2 = av_realloc_array(cmdbuf->reloc_types,  cmdbuf->num_relocs + 1, sizeof(*cmdbuf->reloc_types));
>+    tmp3 = av_realloc_array(cmdbuf->reloc_shifts, cmdbuf->num_relocs + 1, sizeof(*cmdbuf->reloc_shifts));
>+    if (!tmp1 || !tmp2 || !tmp3)
>+        return AVERROR(ENOMEM);
>+
>+    cmdbuf->relocs = tmp1, cmdbuf->reloc_types = tmp2, cmdbuf->reloc_shifts = tmp3;
>+
>+    err = av_nvtegra_cmdbuf_push_value(cmdbuf, offset, 0xdeadbeef);
>+    if (err < 0)
>+        return err;
>+
>+    cmdbuf->relocs[cmdbuf->num_relocs]       = (struct nvhost_reloc){
>+        .cmdbuf_mem    = av_nvtegra_map_get_handle(cmdbuf->map),
>+        .cmdbuf_offset = (uint8_t *)cmdbuf->cur_word - mem - sizeof(uint32_t),
>+        .target        = av_nvtegra_map_get_handle(target),
>+        .target_offset = target_offset,
>+    };
>+
>+    cmdbuf->reloc_types[cmdbuf->num_relocs]  = (struct nvhost_reloc_type){
>+        .reloc_type    = reloc_type,
>+    };
>+
>+    cmdbuf->reloc_shifts[cmdbuf->num_relocs] = (struct nvhost_reloc_shift){
>+        .shift         = shift,
>+    };
>+
>+    cmdbuf->num_relocs++;
>+
>+    return 0;
>+#else
>+    err = av_nvtegra_cmdbuf_push_value(cmdbuf, offset, (target->iova + target_offset) >> shift);
>+    if (err < 0)
>+        return err;
>+
>+    return 0;
>+#endif
>+}
>+
>+int av_nvtegra_cmdbuf_push_syncpt_incr(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt) {
>+    int err;
>+
>+    err = av_nvtegra_cmdbuf_push_word(cmdbuf, host1x_opcode_nonincr(NV_THI_INCR_SYNCPT>>2, 1));
>+    if (err < 0)
>+        return err;
>+
>+    err = av_nvtegra_cmdbuf_push_word(cmdbuf,
>+                                      AV_NVTEGRA_VALUE(NV_THI_INCR_SYNCPT, INDX, syncpt) |
>+                                      AV_NVTEGRA_ENUM (NV_THI_INCR_SYNCPT, COND, OP_DONE));
>+    if (err < 0)
>+        return err;
>+
>+    return 0;
>+}
>+
>+int av_nvtegra_cmdbuf_push_wait(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt, uint32_t fence) {
>+    int err;
>+
>+    err = av_nvtegra_cmdbuf_push_word(cmdbuf, host1x_opcode_setclass(HOST1X_CLASS_HOST1X, 0, 0));
>+    if (err < 0)
>+        return err;
>+
>+    err = av_nvtegra_cmdbuf_push_word(cmdbuf, host1x_opcode_mask(NV_CLASS_HOST_LOAD_SYNCPT_PAYLOAD>>2,
>+                                      (1<<(NV_CLASS_HOST_LOAD_SYNCPT_PAYLOAD - NV_CLASS_HOST_LOAD_SYNCPT_PAYLOAD)) |
>+                                      (1<<(NV_CLASS_HOST_WAIT_SYNCPT         - NV_CLASS_HOST_LOAD_SYNCPT_PAYLOAD))));
>+    if (err < 0)
>+        return err;
>+
>+    err = av_nvtegra_cmdbuf_push_word(cmdbuf, fence);
>+    if (err < 0)
>+        return err;
>+
>+    err = av_nvtegra_cmdbuf_push_word(cmdbuf, syncpt);
>+    if (err < 0)
>+        return err;
>+
>+    return 0;
>+}
>+
>+int av_nvtegra_cmdbuf_add_syncpt_incr(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt, uint32_t fence)
>+{
>+    void *tmp1;
>+#ifndef __SWITCH__
>+    void *tmp2;
>+#endif
>+
>+    tmp1 = av_realloc_array(cmdbuf->syncpt_incrs, cmdbuf->num_syncpt_incrs + 1, sizeof(*cmdbuf->syncpt_incrs));
>+#ifndef __SWITCH__
>+    tmp2 = av_realloc_array(cmdbuf->fences,       cmdbuf->num_syncpt_incrs + 1, sizeof(*cmdbuf->fences));
>+#endif
>+
>+#ifndef __SWITCH__
>+    if (!tmp1 || !tmp2)
>+#else
>+    if (!tmp1)
>+#endif
>+        return AVERROR(ENOMEM);
>+
>+    cmdbuf->syncpt_incrs = tmp1;
>+#ifndef __SWITCH__
>+    cmdbuf->fences       = tmp2;
>+#endif
>+
>+    cmdbuf->syncpt_incrs[cmdbuf->num_syncpt_incrs] = (struct nvhost_syncpt_incr){
>+        .syncpt_id    = syncpt,
>+        .syncpt_incrs = 1,
>+    };
>+
>+#ifndef __SWITCH__
>+    cmdbuf->fences[cmdbuf->num_syncpt_incrs]       = fence;
>+#endif
>+
>+    cmdbuf->num_syncpt_incrs++;
>+
>+    return av_nvtegra_cmdbuf_push_syncpt_incr(cmdbuf, syncpt);
>+}
>+
>+/*
>+ * Queue a wait on `syncpt` with threshold `fence`.
>+ *
>+ * On non-Switch builds a struct nvhost_waitchk entry describing the wait is
>+ * appended to cmdbuf->waitchks first; on every platform the wait command
>+ * itself is then pushed with av_nvtegra_cmdbuf_push_wait().
>+ *
>+ * Returns AVERROR(ENOMEM) if the waitchk array cannot be grown, otherwise the
>+ * result of av_nvtegra_cmdbuf_push_wait().
>+ */
>+int av_nvtegra_cmdbuf_add_waitchk(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt, uint32_t fence) {
>+#ifndef __SWITCH__
>+    uint8_t *mem;
>+    void *tmp;
>+
>+    /* CPU address of the map backing the command buffer, used to derive a byte offset */
>+    mem = av_nvtegra_map_get_addr(cmdbuf->map);
>+
>+    tmp = av_realloc_array(cmdbuf->waitchks, cmdbuf->num_waitchks + 1, sizeof(*cmdbuf->waitchks));
>+    if (!tmp)
>+        return AVERROR(ENOMEM);
>+
>+    cmdbuf->waitchks = tmp;
>+
>+    /*
>+     * .offset is the byte offset, within the map, of the word just before
>+     * cur_word, sampled before the wait command is pushed below.
>+     * NOTE(review): confirm this is the word the kernel expects to check.
>+     */
>+    cmdbuf->waitchks[cmdbuf->num_waitchks] = (struct nvhost_waitchk){
>+        .mem       = av_nvtegra_map_get_handle(cmdbuf->map),
>+        .offset    = (uint8_t *)cmdbuf->cur_word - mem - sizeof(uint32_t),
>+        .syncpt_id = syncpt,
>+        .thresh    = fence,
>+    };
>+
>+    cmdbuf->num_waitchks++;
>+#endif
>+
>+    return av_nvtegra_cmdbuf_push_wait(cmdbuf, syncpt, fence);
>+}
>+
>+/*
>+ * AVBuffer free callback for a job: tears down the command buffer and the
>+ * input map of the job stored in `data`, then frees the job itself.
>+ * A NULL `data` is ignored; `opaque` is not used.
>+ */
>+static void nvtegra_job_free(void *opaque, uint8_t *data) {
>+    AVNVTegraJob *entry = (AVNVTegraJob *)data;
>+
>+    if (entry) {
>+        av_nvtegra_cmdbuf_deinit(&entry->cmdbuf);
>+        av_nvtegra_map_destroy(&entry->input_map);
>+
>+        av_freep(&entry);
>+    }
>+}
>+
>+/*
>+ * AVBufferPool allocation callback: builds one AVNVTegraJob and wraps it in an
>+ * AVBufferRef whose free callback is nvtegra_job_free().
>+ *
>+ * `opaque` is the owning AVNVTegraJobPool; `size` is not used. The job gets an
>+ * input map of pool->input_map_size bytes, and the region
>+ * [pool->cmdbuf_off, pool->cmdbuf_off + pool->max_cmdbuf_size) of that map is
>+ * handed to the command buffer.
>+ *
>+ * Returns NULL on any failure, after releasing what had been set up.
>+ */
>+static AVBufferRef *nvtegra_job_alloc(void *opaque, size_t size) {
>+    AVNVTegraJobPool *pool = opaque;
>+
>+    AVBufferRef  *buffer;
>+    AVNVTegraJob *job;
>+    int err;
>+
>+    job = av_mallocz(sizeof(*job));
>+    if (!job)
>+        return NULL;
>+
>+    err = av_nvtegra_map_create(&job->input_map, pool->channel, pool->input_map_size, 0x100,
>+                                NVMAP_HEAP_IOVMM, NVMAP_HANDLE_WRITE_COMBINE);
>+    if (err < 0)
>+        goto fail;
>+
>+    err = av_nvtegra_cmdbuf_init(&job->cmdbuf);
>+    if (err < 0)
>+        goto fail;
>+
>+    err = av_nvtegra_cmdbuf_add_memory(&job->cmdbuf, &job->input_map, pool->cmdbuf_off, pool->max_cmdbuf_size);
>+    if (err < 0)
>+        goto fail;
>+
>+    buffer = av_buffer_create((uint8_t *)job, sizeof(*job), nvtegra_job_free, pool, 0);
>+    if (!buffer)
>+        goto fail;
>+
>+    return buffer;
>+
>+fail:
>+    av_nvtegra_cmdbuf_deinit(&job->cmdbuf);
>+    av_nvtegra_map_destroy(&job->input_map);
>+    /*
>+     * av_freep() takes the address of the pointer to free. Passing `job`
>+     * itself would free whatever the first bytes of the job contain and leak
>+     * the job allocation.
>+     */
>+    av_freep(&job);
>+    return NULL;
>+}
>+
>+/*
>+ * Fill in a job pool: remember the target channel and the sizing parameters
>+ * used when jobs are allocated, then create the underlying AVBufferPool with
>+ * nvtegra_job_alloc() as allocator and the pool itself as opaque pointer.
>+ *
>+ * Returns 0 on success, AVERROR(ENOMEM) if the buffer pool cannot be created.
>+ */
>+int av_nvtegra_job_pool_init(AVNVTegraJobPool *pool, AVNVTegraChannel *channel,
>+                             size_t input_map_size, off_t cmdbuf_off, size_t max_cmdbuf_size)
>+{
>+    pool->channel         = channel;
>+    pool->input_map_size  = input_map_size;
>+    pool->cmdbuf_off      = cmdbuf_off;
>+    pool->max_cmdbuf_size = max_cmdbuf_size;
>+
>+    pool->pool = av_buffer_pool_init2(sizeof(AVNVTegraJob), pool,
>+                                      nvtegra_job_alloc, NULL);
>+
>+    return pool->pool ? 0 : AVERROR(ENOMEM);
>+}
>+
>+/* Release the AVBufferPool held by the job pool. Always returns 0. */
>+int av_nvtegra_job_pool_uninit(AVNVTegraJobPool *pool) {
>+    av_buffer_pool_uninit(&pool->pool);
>+    return 0;
>+}
>+
>+/*
>+ * Get a job buffer from the pool; the result of av_buffer_pool_get() is
>+ * returned as is. Buffers created for this pool carry an AVNVTegraJob as data.
>+ */
>+AVBufferRef *av_nvtegra_job_pool_get(AVNVTegraJobPool *pool) {
>+    return av_buffer_pool_get(pool->pool);
>+}
>+
>+/*
>+ * Submit the job's command buffer to the pool's channel. The fence reported
>+ * by the submission is stored in job->fence. Returns the result of
>+ * av_nvtegra_channel_submit().
>+ */
>+int av_nvtegra_job_submit(AVNVTegraJobPool *pool, AVNVTegraJob *job) {
>+    return av_nvtegra_channel_submit(pool->channel, &job->cmdbuf, &job->fence);
>+}
>+
>+/*
>+ * Wait on the pool channel's syncpoint using job->fence as threshold.
>+ * `timeout` is forwarded unchanged to av_nvtegra_syncpt_wait(), whose result
>+ * is returned. NOTE(review): the timeout unit is not visible here - confirm.
>+ */
>+int av_nvtegra_job_wait(AVNVTegraJobPool *pool, AVNVTegraJob *job, int timeout) {
>+    return av_nvtegra_syncpt_wait(pool->channel, job->fence, timeout);
>+}
>diff --git a/libavutil/nvtegra.h b/libavutil/nvtegra.h
>new file mode 100644
>index 0000000000..3b63335d6c
>--- /dev/null
>+++ b/libavutil/nvtegra.h
>@@ -0,0 +1,258 @@
>+/*
>+ * Copyright (c) 2024 averne <averne381@gmail.com>
>+ *
>+ * This file is part of FFmpeg.
>+ *
>+ * FFmpeg is free software; you can redistribute it and/or modify
>+ * it under the terms of the GNU General Public License as published by
>+ * the Free Software Foundation; either version 2 of the License, or
>+ * (at your option) any later version.
>+ *
>+ * FFmpeg is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>+ * GNU General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU General Public License along
>+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
>+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
>+ */
>+
>+#ifndef AVUTIL_NVTEGRA_H
>+#define AVUTIL_NVTEGRA_H
>+
>+#include <stdint.h>
>+#include <stdbool.h>
>+
>+#include "buffer.h"
>+
>+#include "nvhost_ioctl.h"
>+#include "nvmap_ioctl.h"
>+
>+/*
>+ * Handle to a hardware channel.
>+ * The platform-specific part is a plain file descriptor on non-Switch builds
>+ * and an NvChannel object on Switch builds.
>+ */
>+typedef struct AVNVTegraChannel {
>+#ifndef __SWITCH__
>+    int fd;
>+    int module_id;
>+#else
>+    NvChannel channel;
>+#endif
>+
>+    /* Syncpoint id associated with the channel */
>+    uint32_t syncpt;
>+
>+#ifdef __SWITCH__
>+    MmuRequest mmu_request;
>+#endif
>+    uint32_t clock;
>+} AVNVTegraChannel;
>+
>+/*
>+ * Memory buffer descriptor.
>+ * Use av_nvtegra_map_get_handle(), av_nvtegra_map_get_addr() and
>+ * av_nvtegra_map_get_size() to read the handle, CPU address and size
>+ * independently of the platform-specific layout.
>+ */
>+typedef struct AVNVTegraMap {
>+#ifndef __SWITCH__
>+    uint32_t handle;
>+    uint32_t size;
>+    void *cpu_addr;
>+#else
>+    NvMap map;
>+    uint32_t iova;
>+    uint32_t owner;
>+#endif
>+    bool is_linear;
>+} AVNVTegraMap;
>+
>+/*
>+ * State of a command buffer being recorded.
>+ * Several arrays only exist on non-Switch builds.
>+ */
>+typedef struct AVNVTegraCmdbuf {
>+    /* Map providing the memory the commands are written to */
>+    AVNVTegraMap *map;
>+
>+    uint32_t mem_offset, mem_size;
>+
>+    /* Position inside the map's CPU mapping; presumably the next word to write - TODO confirm */
>+    uint32_t *cur_word;
>+
>+    struct nvhost_cmdbuf       *cmdbufs;
>+#ifndef __SWITCH__
>+    struct nvhost_cmdbuf_ext   *cmdbuf_exts;
>+    uint32_t                   *class_ids;
>+#endif
>+    uint32_t num_cmdbufs;
>+
>+#ifndef __SWITCH__
>+    struct nvhost_reloc        *relocs;
>+    struct nvhost_reloc_type   *reloc_types;
>+    struct nvhost_reloc_shift  *reloc_shifts;
>+    uint32_t num_relocs;
>+#endif
>+
>+    /* syncpt_incrs and (non-Switch) fences are parallel arrays of num_syncpt_incrs entries */
>+    struct nvhost_syncpt_incr  *syncpt_incrs;
>+#ifndef __SWITCH__
>+    uint32_t                   *fences;
>+#endif
>+    uint32_t num_syncpt_incrs;
>+
>+#ifndef __SWITCH__
>+    /* Array of num_waitchks wait descriptors */
>+    struct nvhost_waitchk      *waitchks;
>+    uint32_t num_waitchks;
>+#endif
>+} AVNVTegraCmdbuf;
>+
>+/*
>+ * Pool of reusable jobs together with the parameters used to allocate them.
>+ * All fields are filled in by av_nvtegra_job_pool_init().
>+ */
>+typedef struct AVNVTegraJobPool {
>+    /*
>+     * Pool object for job allocation
>+     */
>+    AVBufferPool *pool;
>+
>+    /*
>+     * Hardware channel the jobs will be submitted to
>+     */
>+    AVNVTegraChannel *channel;
>+
>+    /*
>+     * Total size of the input memory-mapped buffer
>+     */
>+    size_t input_map_size;
>+
>+    /*
>+     * Offset of the command data within the input map
>+     */
>+    off_t cmdbuf_off;
>+
>+    /*
>+     * Maximum memory usable by the command buffer
>+     */
>+    size_t max_cmdbuf_size;
>+} AVNVTegraJobPool;
>+
>+/*
>+ * One unit of work: an input map, a command buffer recording into part of
>+ * that map, and the fence obtained when the job is submitted.
>+ */
>+typedef struct AVNVTegraJob {
>+    /*
>+     * Memory-mapped buffer for command buffers, metadata structures, ...
>+     */
>+    AVNVTegraMap input_map;
>+
>+    /*
>+     * Object for command recording
>+     */
>+    AVNVTegraCmdbuf cmdbuf;
>+
>+    /*
>+     * Fence indicating completion of the job
>+     */
>+    uint32_t fence;
>+} AVNVTegraJob;
>+
>+AVBufferRef *av_nvtegra_driver_init(void);
>+
>+int av_nvtegra_channel_open(AVNVTegraChannel *channel, const char *dev);
>+int av_nvtegra_channel_close(AVNVTegraChannel *channel);
>+int av_nvtegra_channel_get_clock_rate(AVNVTegraChannel *channel, uint32_t moduleid, uint32_t *clock_rate);
>+int av_nvtegra_channel_set_clock_rate(AVNVTegraChannel *channel, uint32_t moduleid, uint32_t clock_rate);
>+int av_nvtegra_channel_submit(AVNVTegraChannel *channel, AVNVTegraCmdbuf *cmdbuf, uint32_t *fence);
>+int av_nvtegra_channel_set_submit_timeout(AVNVTegraChannel *channel, uint32_t timeout_ms);
>+
>+int av_nvtegra_syncpt_wait(AVNVTegraChannel *channel, uint32_t threshold, int32_t timeout);
>+
>+int av_nvtegra_map_allocate(AVNVTegraMap *map, AVNVTegraChannel *owner, uint32_t size,
>+                            uint32_t align, int heap_mask, int flags);
>+int av_nvtegra_map_free(AVNVTegraMap *map);
>+int av_nvtegra_map_from_va(AVNVTegraMap *map, AVNVTegraChannel *owner, void *mem,
>+                           uint32_t size, uint32_t align, uint32_t flags);
>+int av_nvtegra_map_close(AVNVTegraMap *map);
>+int av_nvtegra_map_map(AVNVTegraMap *map);
>+int av_nvtegra_map_unmap(AVNVTegraMap *map);
>+int av_nvtegra_map_cache_op(AVNVTegraMap *map, int op, void *addr, size_t len);
>+int av_nvtegra_map_realloc(AVNVTegraMap *map, uint32_t size, uint32_t align, int heap_mask, int flags);
>+
>+/*
>+ * Allocate a map and then map it. If the allocation fails its error is
>+ * returned and no mapping is attempted; otherwise the result of
>+ * av_nvtegra_map_map() is returned.
>+ */
>+static inline int av_nvtegra_map_create(AVNVTegraMap *map, AVNVTegraChannel *owner, uint32_t size, uint32_t align,
>+                                        int heap_mask, int flags)
>+{
>+    int ret = av_nvtegra_map_allocate(map, owner, size, align, heap_mask, flags);
>+
>+    return ret < 0 ? ret : av_nvtegra_map_map(map);
>+}
>+
>+/*
>+ * Unmap a map and then free it. If unmapping fails its error is returned and
>+ * the map is not freed; otherwise the result of av_nvtegra_map_free() is
>+ * returned.
>+ */
>+static inline int av_nvtegra_map_destroy(AVNVTegraMap *map) {
>+    int ret = av_nvtegra_map_unmap(map);
>+
>+    return ret < 0 ? ret : av_nvtegra_map_free(map);
>+}
>+
>+int av_nvtegra_cmdbuf_init(AVNVTegraCmdbuf *cmdbuf);
>+int av_nvtegra_cmdbuf_deinit(AVNVTegraCmdbuf *cmdbuf);
>+int av_nvtegra_cmdbuf_add_memory(AVNVTegraCmdbuf *cmdbuf, AVNVTegraMap *map, uint32_t offset, uint32_t size);
>+int av_nvtegra_cmdbuf_clear(AVNVTegraCmdbuf *cmdbuf);
>+int av_nvtegra_cmdbuf_begin(AVNVTegraCmdbuf *cmdbuf, uint32_t class_id);
>+int av_nvtegra_cmdbuf_end(AVNVTegraCmdbuf *cmdbuf);
>+int av_nvtegra_cmdbuf_push_word(AVNVTegraCmdbuf *cmdbuf, uint32_t word);
>+int av_nvtegra_cmdbuf_push_value(AVNVTegraCmdbuf *cmdbuf, uint32_t offset, uint32_t word);
>+int av_nvtegra_cmdbuf_push_reloc(AVNVTegraCmdbuf *cmdbuf, uint32_t offset, AVNVTegraMap *target, uint32_t target_offset,
>+                                 int reloc_type, int shift);
>+int av_nvtegra_cmdbuf_push_syncpt_incr(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt);
>+int av_nvtegra_cmdbuf_push_wait(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt, uint32_t fence);
>+int av_nvtegra_cmdbuf_add_syncpt_incr(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt, uint32_t fence);
>+int av_nvtegra_cmdbuf_add_waitchk(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt, uint32_t fence);
>+
>+/*
>+ * Job allocation and submission routines
>+ */
>+int av_nvtegra_job_pool_init(AVNVTegraJobPool *pool, AVNVTegraChannel *channel,
>+                             size_t input_map_size, off_t cmdbuf_off, size_t max_cmdbuf_size);
>+int av_nvtegra_job_pool_uninit(AVNVTegraJobPool *pool);
>+AVBufferRef *av_nvtegra_job_pool_get(AVNVTegraJobPool *pool);
>+
>+int av_nvtegra_job_submit(AVNVTegraJobPool *pool, AVNVTegraJob *job);
>+int av_nvtegra_job_wait(AVNVTegraJobPool *pool, AVNVTegraJob *job, int timeout);
>+
>+/* Return the handle of the map, wherever the platform-specific layout stores it. */
>+static inline uint32_t av_nvtegra_map_get_handle(AVNVTegraMap *map) {
>+#ifndef __SWITCH__
>+    return map->handle;
>+#else
>+    return map->map.handle;
>+#endif
>+}
>+
>+/* Return the CPU address of the map, wherever the platform-specific layout stores it. */
>+static inline void *av_nvtegra_map_get_addr(AVNVTegraMap *map) {
>+#ifndef __SWITCH__
>+    return map->cpu_addr;
>+#else
>+    return map->map.cpu_addr;
>+#endif
>+}
>+
>+/* Return the size of the map, wherever the platform-specific layout stores it. */
>+static inline uint32_t av_nvtegra_map_get_size(AVNVTegraMap *map) {
>+#ifndef __SWITCH__
>+    return map->size;
>+#else
>+    return map->map.size;
>+#endif
>+}
>+
>+/* Addresses are shifted by 8 bits in the command buffer, requiring an alignment to 256 */
>+#define AV_NVTEGRA_MAP_ALIGN (1 << 8)
>+
>+/*
>+ * Build the word for a bit field: `value` is masked to the width of the field
>+ * and shifted to the field's lowest bit.
>+ * offset ## _ ## field must expand to "high:low" (e.g. 15:8); the expressions
>+ * (1?high:low) and (0?high:low) then select the two bounds.
>+ * `value` is parenthesized so that a compound argument such as `a | b` is
>+ * masked as a whole instead of only its right-hand operand.
>+ */
>+#define AV_NVTEGRA_VALUE(offset, field, value)                                                    \
>+    (((value) &                                                                                   \
>+    ((uint32_t)((UINT64_C(1) << ((1?offset ## _ ## field) - (0?offset ## _ ## field) + 1)) - 1))) \
>+    << (0?offset ## _ ## field))
>+
>+/*
>+ * Same as AV_NVTEGRA_VALUE, but the value is the named constant
>+ * offset ## _ ## field ## _ ## value instead of an arbitrary expression.
>+ */
>+#define AV_NVTEGRA_ENUM(offset, field, value)                                                     \
>+    ((offset ## _ ## field ## _ ## value &                                                        \
>+    ((uint32_t)((UINT64_C(1) << ((1?offset ## _ ## field) - (0?offset ## _ ## field) + 1)) - 1))) \
>+    << (0?offset ## _ ## field))
>+
>+/*
>+ * Push `value` at byte offset `offset` (converted to a word index) with
>+ * av_nvtegra_cmdbuf_push_value().
>+ * Beware: on failure this macro executes `return _err;` in the CALLING
>+ * function, so it may only be used inside functions returning int.
>+ * It relies on the ({ ... }) statement-expression compiler extension.
>+ */
>+#define AV_NVTEGRA_PUSH_VALUE(cmdbuf, offset, value) ({                                  \
>+    int _err = av_nvtegra_cmdbuf_push_value(cmdbuf, (offset) / sizeof(uint32_t), value); \
>+    if (_err < 0)                                                                        \
>+        return _err;                                                                     \
>+})
>+
>+/*
>+ * Push a relocation at byte offset `offset` (converted to a word index) with
>+ * av_nvtegra_cmdbuf_push_reloc(), always using a shift of 8.
>+ * Beware: on failure this macro executes `return _err;` in the CALLING
>+ * function, so it may only be used inside functions returning int.
>+ * It relies on the ({ ... }) statement-expression compiler extension.
>+ */
>+#define AV_NVTEGRA_PUSH_RELOC(cmdbuf, offset, target, target_offset, type) ({    \
>+    int _err = av_nvtegra_cmdbuf_push_reloc(cmdbuf, (offset) / sizeof(uint32_t), \
>+                                            target, target_offset, type, 8);     \
>+    if (_err < 0)                                                                \
>+        return _err;                                                             \
>+})
>+
>+#endif /* AVUTIL_NVTEGRA_H */
>diff --git a/libavutil/nvtegra_host1x.h b/libavutil/nvtegra_host1x.h
>new file mode 100644
>index 0000000000..25e37eae61
>--- /dev/null
>+++ b/libavutil/nvtegra_host1x.h
>@@ -0,0 +1,94 @@
>+/*
>+ * Copyright (c) 2024 averne <averne381@gmail.com>
>+ *
>+ * This file is part of FFmpeg.
>+ *
>+ * FFmpeg is free software; you can redistribute it and/or modify
>+ * it under the terms of the GNU General Public License as published by
>+ * the Free Software Foundation; either version 2 of the License, or
>+ * (at your option) any later version.
>+ *
>+ * FFmpeg is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>+ * GNU General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU General Public License along
>+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
>+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
>+ */
>+
>+#ifndef AVUTIL_NVTEGRA_HOST1X_H
>+#define AVUTIL_NVTEGRA_HOST1X_H
>+
>+#include <stdint.h>
>+
>+#include "macros.h"
>+
>+/*
>+ * From L4T include/linux/host1x.h
>+ * Class identifiers; presumably the values expected as class_id by
>+ * host1x_opcode_setclass() - TODO confirm.
>+ */
>+enum host1x_class {
>+    HOST1X_CLASS_HOST1X  = 0x01,
>+    HOST1X_CLASS_NVENC   = 0x21,
>+    HOST1X_CLASS_VI      = 0x30,
>+    HOST1X_CLASS_ISPA    = 0x32,
>+    HOST1X_CLASS_ISPB    = 0x34,
>+    HOST1X_CLASS_GR2D    = 0x51,
>+    HOST1X_CLASS_GR2D_SB = 0x52,
>+    HOST1X_CLASS_VIC     = 0x5d,
>+    HOST1X_CLASS_GR3D    = 0x60,
>+    HOST1X_CLASS_NVJPG   = 0xc0,
>+    HOST1X_CLASS_NVDEC   = 0xf0,
>+};
>+
>+/*
>+ * Encode a command word with opcode 0 in bits 31:28: `offset` is placed from
>+ * bit 16, `class_id` from bit 6 and `mask` from bit 0. Arguments are not
>+ * range-checked.
>+ */
>+static inline uint32_t host1x_opcode_setclass(unsigned class_id, unsigned offset, unsigned mask) {
>+    return (0 << 28) | (offset << 16) | (class_id << 6) | mask;
>+}
>+
>+/*
>+ * Encode a command word with opcode 1 in bits 31:28: `offset` is placed from
>+ * bit 16 and `count` from bit 0. Arguments are not range-checked.
>+ */
>+static inline uint32_t host1x_opcode_incr(unsigned offset, unsigned count) {
>+    return (1 << 28) | (offset << 16) | count;
>+}
>+
>+/*
>+ * Encode a command word with opcode 2 in bits 31:28: `offset` is placed from
>+ * bit 16 and `count` from bit 0. Arguments are not range-checked.
>+ */
>+static inline uint32_t host1x_opcode_nonincr(unsigned offset, unsigned count) {
>+    return (2 << 28) | (offset << 16) | count;
>+}
>+
>+/*
>+ * Encode a command word with opcode 3 in bits 31:28: `offset` is placed from
>+ * bit 16 and `mask` from bit 0. Arguments are not range-checked.
>+ */
>+static inline uint32_t host1x_opcode_mask(unsigned offset, unsigned mask) {
>+    return (3 << 28) | (offset << 16) | mask;
>+}
>+
>+/*
>+ * Encode a command word with opcode 4 in bits 31:28: `offset` is placed from
>+ * bit 16 and `value` from bit 0. Arguments are not range-checked.
>+ */
>+static inline uint32_t host1x_opcode_imm(unsigned offset, unsigned value) {
>+    return (4 << 28) | (offset << 16) | value;
>+}
>+
>+#define NV_CLASS_HOST_LOAD_SYNCPT_PAYLOAD                                  (0x00000138)
>+#define NV_CLASS_HOST_WAIT_SYNCPT                                          (0x00000140)
>+
>+#define NV_THI_INCR_SYNCPT                                                 (0x00000000)
>+#define NV_THI_INCR_SYNCPT_INDX                                            7:0
>+#define NV_THI_INCR_SYNCPT_COND                                            15:8
>+#define NV_THI_INCR_SYNCPT_COND_IMMEDIATE                                  (0x00000000)
>+#define NV_THI_INCR_SYNCPT_COND_OP_DONE                                    (0x00000001)
>+#define NV_THI_INCR_SYNCPT_ERR                                             (0x00000008)
>+#define NV_THI_INCR_SYNCPT_ERR_COND_STS_IMM                                0:0
>+#define NV_THI_INCR_SYNCPT_ERR_COND_STS_OPDONE                             1:1
>+#define NV_THI_CTXSW_INCR_SYNCPT                                           (0x0000000c)
>+#define NV_THI_CTXSW_INCR_SYNCPT_INDX                                      7:0
>+#define NV_THI_CTXSW                                                       (0x00000020)
>+#define NV_THI_CTXSW_CURR_CLASS                                            9:0
>+#define NV_THI_CTXSW_AUTO_ACK                                              11:11
>+#define NV_THI_CTXSW_CURR_CHANNEL                                          15:12
>+#define NV_THI_CTXSW_NEXT_CLASS                                            25:16
>+#define NV_THI_CTXSW_NEXT_CHANNEL                                          31:28
>+#define NV_THI_CONT_SYNCPT_EOF                                             (0x00000028)
>+#define NV_THI_CONT_SYNCPT_EOF_INDEX                                       7:0
>+#define NV_THI_CONT_SYNCPT_EOF_COND                                        8:8
>+#define NV_THI_METHOD0                                                     (0x00000040)
>+#define NV_THI_METHOD0_OFFSET                                              11:0
>+#define NV_THI_METHOD1                                                     (0x00000044)
>+#define NV_THI_METHOD1_DATA                                                31:0
>+#define NV_THI_INT_STATUS                                                  (0x00000078)
>+#define NV_THI_INT_STATUS_FALCON_INT                                       0:0
>+#define NV_THI_INT_MASK                                                    (0x0000007c)
>+#define NV_THI_INT_MASK_FALCON_INT                                         0:0
>+
>+#endif /* AVUTIL_NVTEGRA_HOST1X_H */
>diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c
>index 1c0bcf2232..bb14b1b306 100644
>--- a/libavutil/pixdesc.c
>+++ b/libavutil/pixdesc.c
>@@ -2791,6 +2791,10 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
>         },
>         .flags = AV_PIX_FMT_FLAG_PLANAR,
>     },
>+    [AV_PIX_FMT_NVTEGRA] = {
>+        .name = "nvtegra",
>+        .flags = AV_PIX_FMT_FLAG_HWACCEL,
>+    },
> };
> 
> static const char * const color_range_names[] = {
>diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
>index a7f50e1690..a3213c792a 100644
>--- a/libavutil/pixfmt.h
>+++ b/libavutil/pixfmt.h
>@@ -439,6 +439,14 @@ enum AVPixelFormat {
>      */
>     AV_PIX_FMT_D3D12,
> 
>+    /**
>+     * Hardware surfaces for Tegra devices.
>+     *
>+     * data[0..2] points to memory-mapped buffer containing frame data
>+     * buf[0] contains an AVBufferRef to an AVNVTegraMap
>+     */
>+    AV_PIX_FMT_NVTEGRA,
>+
>     AV_PIX_FMT_NB         ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
> };
>
averne May 31, 2024, 9:06 p.m. UTC | #2
Le 31/05/2024 à 10:32, Rémi Denis-Courmont a écrit :
> 
> 
> Le 30 mai 2024 22:43:07 GMT+03:00, averne <averne381@gmail.com> a écrit :
>> This includes a new pixel format for nvtegra hardware frames, and several objects for interaction with hardware blocks.
>> In particular, this contains code for channels (handles to hardware engines), maps (memory-mapped buffers shared with engines), and command buffers (abstraction for building command lists sent to the engines).
>>
>> Signed-off-by: averne <averne381@gmail.com>
>> ---
>> configure                  |    2 +
>> libavutil/Makefile         |    4 +
>> libavutil/nvtegra.c        | 1035 ++++++++++++++++++++++++++++++++++++
>> libavutil/nvtegra.h        |  258 +++++++++
>> libavutil/nvtegra_host1x.h |   94 ++++
>> libavutil/pixdesc.c        |    4 +
>> libavutil/pixfmt.h         |    8 +
>> 7 files changed, 1405 insertions(+)
>> create mode 100644 libavutil/nvtegra.c
>> create mode 100644 libavutil/nvtegra.h
>> create mode 100644 libavutil/nvtegra_host1x.h
>>
>> diff --git a/configure b/configure
>> index 09fb2aed1b..51f169bfbd 100755
>> --- a/configure
>> +++ b/configure
>> @@ -361,6 +361,7 @@ External library support:
>>   --disable-vdpau          disable Nvidia Video Decode and Presentation API for Unix code [autodetect]
>>   --disable-videotoolbox   disable VideoToolbox code [autodetect]
>>   --disable-vulkan         disable Vulkan code [autodetect]
>> +  --enable-nvtegra         enable nvtegra code [no]
>>
>> Toolchain options:
>>   --arch=ARCH              select architecture [$arch]
>> @@ -3151,6 +3152,7 @@ videotoolbox_hwaccel_deps="videotoolbox pthreads"
>> videotoolbox_hwaccel_extralibs="-framework QuartzCore"
>> vulkan_deps="threads"
>> vulkan_deps_any="libdl LoadLibrary"
>> +nvtegra_deps="gpl"
>>
>> av1_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_AV1"
>> av1_d3d11va_hwaccel_select="av1_decoder"
>> diff --git a/libavutil/Makefile b/libavutil/Makefile
>> index 9c112bc58a..733a23a8a3 100644
>> --- a/libavutil/Makefile
>> +++ b/libavutil/Makefile
>> @@ -52,6 +52,7 @@ HEADERS = adler32.h                                                     \
>>           hwcontext_videotoolbox.h                                      \
>>           hwcontext_vdpau.h                                             \
>>           hwcontext_vulkan.h                                            \
>> +          nvtegra.h                                                     \
>>           nvhost_ioctl.h                                                \
>>           nvmap_ioctl.h                                                 \
>>           iamf.h                                                        \
>> @@ -209,6 +210,7 @@ OBJS-$(CONFIG_VDPAU)                    += hwcontext_vdpau.o
>> OBJS-$(CONFIG_VULKAN)                   += hwcontext_vulkan.o vulkan.o
>>
>> OBJS-$(!CONFIG_VULKAN)                  += hwcontext_stub.o
>> +OBJS-$(CONFIG_NVTEGRA)                  += nvtegra.o
>>
>> OBJS += $(COMPAT_OBJS:%=../compat/%)
>>
>> @@ -230,6 +232,8 @@ SKIPHEADERS-$(CONFIG_VDPAU)            += hwcontext_vdpau.h
>> SKIPHEADERS-$(CONFIG_VULKAN)           += hwcontext_vulkan.h vulkan.h   \
>>                                           vulkan_functions.h            \
>>                                           vulkan_loader.h
>> +SKIPHEADERS-$(CONFIG_NVTEGRA)          += nvtegra.h                     \
>> +                                          nvtegra_host1x.h
>>
>> TESTPROGS = adler32                                                     \
>>             aes                                                         \
>> diff --git a/libavutil/nvtegra.c b/libavutil/nvtegra.c
>> new file mode 100644
>> index 0000000000..ad0bbbdfaa
>> --- /dev/null
>> +++ b/libavutil/nvtegra.c
>> @@ -0,0 +1,1035 @@
>> +/*
>> + * Copyright (c) 2024 averne <averne381@gmail.com>
>> + *
>> + * This file is part of FFmpeg.
>> + *
>> + * FFmpeg is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + *
>> + * FFmpeg is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License along
>> + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
>> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
>> + */
>> +
>> +#ifndef __SWITCH__
>> +#   include <sys/ioctl.h>
>> +#   include <sys/mman.h>
>> +#   include <fcntl.h>
>> +#   include <unistd.h>
>> +#else
>> +#   include <stdlib.h>
>> +#   include <switch.h>
>> +#endif
>> +
>> +#include <string.h>
>> +
>> +#include "buffer.h"
>> +#include "log.h"
>> +#include "error.h"
>> +#include "mem.h"
>> +#include "thread.h"
>> +
>> +#include "nvhost_ioctl.h"
>> +#include "nvmap_ioctl.h"
>> +#include "nvtegra_host1x.h"
>> +
>> +#include "nvtegra.h"
>> +
>> +/*
>> + * Tag used by the kernel to identify allocations.
>> + * Official software has been seen using 0x900, 0xf00, 0x1100, 0x1400, 0x4000.
>> + */
>> +#define MEM_TAG (0xfeed)
>> +
>> +struct DriverState {
>> +    int nvmap_fd, nvhost_fd;
>> +};
>> +
>> +static AVMutex g_driver_init_mtx = AV_MUTEX_INITIALIZER;
>> +static struct DriverState *g_driver_state = NULL;
>> +static AVBufferRef *g_driver_state_ref = NULL;
>> +
>> +/*
>> + * Free callback of the driver-state buffer. It ignores its arguments and
>> + * works on the global g_driver_state: releases the driver resources, then
>> + * resets the global mutex, buffer reference and state pointer.
>> + * Does nothing when no driver state is set.
>> + */
>> +static void free_driver_fds(void *opaque, uint8_t *data) {
>> +    if (!g_driver_state)
>> +        return;
>> +
>> +#ifndef __SWITCH__
>> +    /* Only descriptors greater than 0 are closed; 0 is treated as "not opened" */
>> +    if (g_driver_state->nvmap_fd > 0)
>> +        close(g_driver_state->nvmap_fd);
>> +
>> +    if (g_driver_state->nvhost_fd > 0)
>> +        close(g_driver_state->nvhost_fd);
>> +#else
>> +    nvFenceExit();
>> +    nvMapExit();
>> +    nvExit();
>> +    mmuExit();
>> +#endif
>> +
>> +    /* NOTE(review): the mutex is re-initialized here even if a caller currently holds it */
>> +    g_driver_init_mtx  = (AVMutex)AV_MUTEX_INITIALIZER;
>> +    g_driver_state_ref = NULL;
>> +    av_freep(&g_driver_state);
>> +}
>> +
>> +static int init_driver_fds(void) {
>> +    AVBufferRef *ref;
>> +    struct DriverState *state;
>> +    int err;
>> +
>> +    state = av_mallocz(sizeof(*state));
>> +    if (!state)
>> +        return AVERROR(ENOMEM);
>> +
>> +    ref = av_buffer_create((uint8_t *)state, sizeof(*state), free_driver_fds, NULL, 0);
>> +    if (!state)
>> +        return AVERROR(ENOMEM);
>> +
>> +    g_driver_state     = state;
>> +    g_driver_state_ref = ref;
>> +
>> +#ifndef __SWITCH__
>> +    err = open("/dev/nvmap", O_RDWR | O_SYNC);
> 
> There's helpers to open files, and you're missing the close on exec here. Also not clear why you need O_SYNC.
> 
> But did you consider just reimplementing libnvdec instead of putting the device driver directly in FFmpeg?
> 

I checked, and the official code uses O_RDWR|O_SYNC|O_CLOEXEC for
/dev/nvhost-ctrl and /dev/nvmap, and O_RDWR|O_CLOEXEC for
/dev/nvhost-vic and /dev/nvhost-nvdec.
I don't believe O_SYNC is required, but I think it's good
practice to reproduce official behavior when possible. I'll
switch everything to O_RDWR|O_SYNC|O_CLOEXEC.

As for your second question, I probably should've given some
context about this decision. Initially I thought about writing a
vaapi driver, but for a number of reasons I decided against it.
- First, the Switch is a performance-constrained device, so removing
  abstraction layers saves CPU time and memory accesses.
  Integrating directly into FFmpeg enables some optimizations, for 
  instance bitstream data is never copied to a staging buffer, but
  written directly to the memory-mapped buffer that will be fed to the
  hardware.
  There are also some codecs that need information not given in vaapi 
  structures (see for instance sw_hdr_skip_length in the HEVC code),
  so it would require re-parsing slice headers. Likewise, in NVDEC
  the VP9 entropy context isn't managed in hardware/microcode, so the
  vaapi implementation would need to duplicate work.
- Second, a vaapi driver honestly seemed like an enormous amount of
  work: on top of all the reverse-engineering effort, I would need to
  make FFmpeg (and later mpv) happy with my implementation.
- Third, I wasn't certain I would be able to implement zero-copy 
  frame imports in my graphics context. The goal was to use deko3d
  (https://github.com/devkitPro/deko3d), an efficient homebrew graphics
  API for the Switch, which needs CPU addresses to import external 
  buffers.

>> +    if (err < 0)
>> +        return AVERROR(errno);
>> +    state->nvmap_fd = err;
>> +
>> +    err = open("/dev/nvhost-ctrl", O_RDWR | O_SYNC);
>> +    if (err < 0)
>> +        return AVERROR(errno);
>> +    state->nvhost_fd = err;
>> +#else
>> +    err = nvInitialize();
>> +    if (R_FAILED(err))
>> +        return AVERROR(err);
>> +
>> +    err = nvMapInit();
>> +    if (R_FAILED(err))
>> +        return AVERROR(err);
>> +    state->nvmap_fd = nvMapGetFd();
>> +
>> +    err = nvFenceInit();
>> +    if (R_FAILED(err))
>> +        return AVERROR(err);
>> +    /* libnx doesn't export the nvhost-ctrl file descriptor */
>> +
>> +    err = mmuInitialize();
>> +    if (R_FAILED(err))
>> +        return AVERROR(err);
>> +#endif
>> +
>> +    return 0;
>> +}
>> +
>> +/*
>> + * Return the nvmap descriptor stored in the global driver state, or
>> + * AVERROR_UNKNOWN when there is no driver state or the stored value is 0.
>> + */
>> +static inline int get_nvmap_fd(void) {
>> +    if (!g_driver_state || !g_driver_state->nvmap_fd)
>> +        return AVERROR_UNKNOWN;
>> +
>> +    return g_driver_state->nvmap_fd;
>> +}
>> +
>> +/*
>> + * Return the nvhost descriptor stored in the global driver state, or
>> + * AVERROR_UNKNOWN when there is no driver state or the stored value is 0.
>> + */
>> +static inline int get_nvhost_fd(void) {
>> +    if (!g_driver_state || !g_driver_state->nvhost_fd)
>> +        return AVERROR_UNKNOWN;
>> +
>> +    return g_driver_state->nvhost_fd;
>> +}
>> +
>> +/*
>> + * Return a reference to the shared driver state, initializing the driver on
>> + * first use. Returns NULL on failure.
>> + *
>> + * When the state already exists a new reference to it is returned. On the
>> + * initializing call the reference created by init_driver_fds() is returned
>> + * directly.
>> + */
>> +AVBufferRef *av_nvtegra_driver_init(void) {
>> +    AVBufferRef *out = NULL;
>> +    int err;
>> +
>> +    /*
>> +     * We have to do this overly complex dance of putting driver fds in a refcounted struct,
>> +     * otherwise initializing multiple hwcontexts would leak fds
>> +     */
>> +
>> +    /* NOTE(review): on lock failure the exit path still calls ff_mutex_unlock() */
>> +    err = ff_mutex_lock(&g_driver_init_mtx);
>> +    if (err != 0)
>> +        goto exit;
>> +
>> +    if (g_driver_state_ref) {
>> +        out = av_buffer_ref(g_driver_state_ref);
>> +        goto exit;
>> +    }
>> +
>> +    err = init_driver_fds();
>> +    if (err < 0) {
>> +        /* In case memory allocations failed, call the destructor ourselves */
>> +        /*
>> +         * NOTE(review): free_driver_fds() re-initializes g_driver_init_mtx,
>> +         * which is held at this point; the direct call below returns early
>> +         * if the unref already released the state.
>> +         */
>> +        av_buffer_unref(&g_driver_state_ref);
>> +        free_driver_fds(NULL, NULL);
>> +        goto exit;
>> +    }
>> +
>> +    out = g_driver_state_ref;
>> +
>> +exit:
>> +    ff_mutex_unlock(&g_driver_init_mtx);
>> +    return out;
>> +}
>> +
>> +/*
>> + * Open the channel device `dev` and query the syncpoint associated with it,
>> + * storing both in `channel`.
>> + *
>> + * Returns 0 on success or a negative AVERROR code. On failure the channel
>> + * that had been opened is closed again.
>> + */
>> +int av_nvtegra_channel_open(AVNVTegraChannel *channel, const char *dev) {
>> +    int err;
>> +#ifndef __SWITCH__
>> +    struct nvhost_get_param_arg args;
>> +
>> +    /* Close-on-exec keeps the descriptor from leaking into child processes */
>> +    err = open(dev, O_RDWR | O_CLOEXEC);
>> +    if (err < 0)
>> +        return AVERROR(errno);
>> +
>> +    channel->fd = err;
>> +
>> +    args = (struct nvhost_get_param_arg){0};
>> +
>> +    err = ioctl(channel->fd, NVHOST_IOCTL_CHANNEL_GET_SYNCPOINT, &args);
>> +    if (err < 0)
>> +        goto fail;
>> +
>> +    channel->syncpt = args.value;
>> +
>> +    return 0;
>> +
>> +fail:
>> +    /* Capture errno before close(), which may overwrite it */
>> +    err = AVERROR(errno);
>> +    close(channel->fd);
>> +    /* 0 is what av_nvtegra_channel_close() treats as "not open": avoids a double close */
>> +    channel->fd = 0;
>> +    return err;
>> +#else
>> +    err = nvChannelCreate(&channel->channel, dev);
>> +    if (R_FAILED(err))
>> +        return AVERROR(err);
>> +
>> +    err = nvioctlChannel_GetSyncpt(channel->channel.fd, 0, &channel->syncpt);
>> +    if (R_FAILED(err))
>> +        goto fail;
>> +
>> +    return 0;
>> +
>> +fail:
>> +    nvChannelClose(&channel->channel);
>> +    return AVERROR(err);
>> +#endif
>> +}
>> +
>> +/*
>> + * Close a channel. On non-Switch builds a descriptor value of 0 is treated
>> + * as "not open" and 0 is returned; otherwise the result of close() is
>> + * returned as is. On Switch builds the function always returns 0.
>> + */
>> +int av_nvtegra_channel_close(AVNVTegraChannel *channel) {
>> +#ifndef __SWITCH__
>> +    if (!channel->fd)
>> +        return 0;
>> +
>> +    return close(channel->fd);
>> +#else
>> +    nvChannelClose(&channel->channel);
>> +    return 0;
>> +#endif
>> +}
>> +
>> +/*
>> + * Query the clock rate of module `moduleid` through the channel.
>> + * The rate is written to *clock_rate when that pointer is non-NULL.
>> + * Returns 0 on success or a negative error code.
>> + */
>> +int av_nvtegra_channel_get_clock_rate(AVNVTegraChannel *channel, uint32_t moduleid, uint32_t *clock_rate) {
>> +    int err;
>> +#ifndef __SWITCH__
>> +    struct nvhost_clk_rate_args args;
>> +
>> +    args = (struct nvhost_clk_rate_args){
>> +        .moduleid = moduleid,
>> +    };
>> +
>> +    err = ioctl(channel->fd, NVHOST_IOCTL_CHANNEL_GET_CLK_RATE, &args);
>> +    if (err < 0)
>> +        return AVERROR(errno);
>> +
>> +    if (clock_rate)
>> +        *clock_rate = args.rate;
>> +
>> +    return 0;
>> +#else
>> +    uint32_t tmp;
>> +
>> +    err = AVERROR(nvioctlChannel_GetModuleClockRate(channel->channel.fd, moduleid, &tmp));
>> +    if (err < 0)
>> +        return err;
>> +
>> +    /* The Switch call's value is scaled by 1000; presumably kHz to Hz - TODO confirm */
>> +    if (clock_rate)
>> +        *clock_rate = tmp * 1000;
>> +
>> +    return 0;
>> +#endif
>> +}
>> +
>> +/*
>> + * Request clock rate `clock_rate` for module `moduleid` through the channel.
>> + * On Switch builds the rate is divided by 1000 before being passed on.
>> + * Returns 0 on success or a negative error code.
>> + */
>> +int av_nvtegra_channel_set_clock_rate(AVNVTegraChannel *channel, uint32_t moduleid, uint32_t clock_rate) {
>> +#ifndef __SWITCH__
>> +    struct nvhost_clk_rate_args args;
>> +
>> +    args = (struct nvhost_clk_rate_args){
>> +        .rate     = clock_rate,
>> +        .moduleid = moduleid,
>> +    };
>> +
>> +    return (ioctl(channel->fd, NVHOST_IOCTL_CHANNEL_SET_CLK_RATE, &args) < 0) ? AVERROR(errno) : 0;
>> +#else
>> +    return AVERROR(nvioctlChannel_SetModuleClockRate(channel->channel.fd, moduleid, clock_rate / 1000));
>> +#endif
>> +}
>> +
>> +/*
>> + * Submit the contents of a command buffer to a channel.
>> + * The fence value reported by the submission is written to *fence when that
>> + * pointer is non-NULL. Returns 0 on success or a negative AVERROR code.
>> + */
>> +int av_nvtegra_channel_submit(AVNVTegraChannel *channel, AVNVTegraCmdbuf *cmdbuf, uint32_t *fence) {
>> +    int err;
>> +#ifndef __SWITCH__
>> +    struct nvhost_submit_args args;
>> +
>> +    /* The arrays recorded in the command buffer are handed to the kernel as integer-cast pointers */
>> +    args = (struct nvhost_submit_args){
>> +        .submit_version          = NVHOST_SUBMIT_VERSION_V2,
>> +        .num_syncpt_incrs        = cmdbuf->num_syncpt_incrs,
>> +        .num_cmdbufs             = cmdbuf->num_cmdbufs,
>> +        .num_relocs              = cmdbuf->num_relocs,
>> +        .num_waitchks            = cmdbuf->num_waitchks,
>> +        .timeout                 = 0,
>> +        .flags                   = 0,
>> +        .fence                   = 0,
>> +        .syncpt_incrs            = (uintptr_t)cmdbuf->syncpt_incrs,
>> +        .cmdbuf_exts             = (uintptr_t)cmdbuf->cmdbuf_exts,
>> +        .checksum_methods        = 0,
>> +        .checksum_falcon_methods = 0,
>> +        .pad                     = { 0 },
>> +        .reloc_types             = (uintptr_t)cmdbuf->reloc_types,
>> +        .cmdbufs                 = (uintptr_t)cmdbuf->cmdbufs,
>> +        .relocs                  = (uintptr_t)cmdbuf->relocs,
>> +        .reloc_shifts            = (uintptr_t)cmdbuf->reloc_shifts,
>> +        .waitchks                = (uintptr_t)cmdbuf->waitchks,
>> +        .waitbases               = 0,
>> +        .class_ids               = (uintptr_t)cmdbuf->class_ids,
>> +        .fences                  = (uintptr_t)cmdbuf->fences,
>> +    };
>> +
>> +    err = ioctl(channel->fd, NVHOST_IOCTL_CHANNEL_SUBMIT, &args);
>> +    if (err < 0)
>> +        return AVERROR(errno);
>> +
>> +    if (fence)
>> +        *fence = args.fence;
>> +
>> +    return 0;
>> +#else
>> +    nvioctl_fence tmp;
>> +
>> +    /* Only the command buffers and syncpoint increments are passed on Switch builds */
>> +    err = nvioctlChannel_Submit(channel->channel.fd, (nvioctl_cmdbuf *)cmdbuf->cmdbufs, cmdbuf->num_cmdbufs,
>> +                                NULL, NULL, 0, (nvioctl_syncpt_incr *)cmdbuf->syncpt_incrs, cmdbuf->num_syncpt_incrs,
>> +                                &tmp, 1);
>> +    if (R_FAILED(err))
>> +        return AVERROR(err);
>> +
>> +    if (fence)
>> +        *fence = tmp.value;
>> +
>> +    return 0;
>> +#endif
>> +}
>> +
>> +int av_nvtegra_channel_set_submit_timeout(AVNVTegraChannel *channel, uint32_t timeout_ms) {
>> +#ifndef __SWITCH__
>> +    struct nvhost_set_timeout_args args;
>> +
>> +    args = (struct nvhost_set_timeout_args){
>> +        .timeout = timeout_ms,
>> +    };
>> +
>> +    return (ioctl(channel->fd, NVHOST_IOCTL_CHANNEL_SET_TIMEOUT, &args) < 0) ? AVERROR(errno) : 0;
>> +#else
>> +    return AVERROR(nvioctlChannel_SetSubmitTimeout(channel->channel.fd, timeout_ms));
>> +#endif
>> +}
>> +
>> +int av_nvtegra_syncpt_wait(AVNVTegraChannel *channel, uint32_t threshold, int32_t timeout) {
>> +#ifndef __SWITCH__
>> +    struct nvhost_ctrl_syncpt_waitex_args args = {
>> +        .id      = channel->syncpt,
>> +        .thresh  = threshold,
>> +        .timeout = timeout,
>> +    };
>> +
>> +    return (ioctl(get_nvhost_fd(), NVHOST_IOCTL_CTRL_SYNCPT_WAITEX, &args) < 0) ? AVERROR(errno) : 0;
>> +#else
>> +    NvFence fence;
>> +
>> +    fence = (NvFence){
>> +        .id    = channel->syncpt,
>> +        .value = threshold,
>> +    };
>> +
>> +    return AVERROR(nvFenceWait(&fence, timeout));
>> +#endif
>> +}
>> +
>> +#ifdef __SWITCH__
>> +static inline bool convert_cache_flags(uint32_t flags) {
>> +    /* Return whether the map should be CPU-cacheable */
>> +    switch (flags & NVMAP_HANDLE_CACHE_FLAG) {
>> +        case NVMAP_HANDLE_INNER_CACHEABLE:
>> +        case NVMAP_HANDLE_CACHEABLE:
>> +            return true;
>> +        default:
>> +            return false;
>> +    }
>> +}
>> +#endif
>> +
>> +int av_nvtegra_map_allocate(AVNVTegraMap *map, AVNVTegraChannel *channel, uint32_t size,
>> +                            uint32_t align, int heap_mask, int flags)
>> +{
>> +#ifndef __SWITCH__
>> +    struct nvmap_create_handle create_args;
>> +    struct nvmap_alloc_handle alloc_args;
>> +    int err;
>> +
>> +    create_args = (struct nvmap_create_handle){
>> +        .size   = size,
>> +    };
>> +
>> +    err = ioctl(get_nvmap_fd(), NVMAP_IOC_CREATE, &create_args);
>> +    if (err < 0)
>> +        return AVERROR(errno);
>> +
>> +    map->size   = size;
>> +    map->handle = create_args.handle;
>> +
>> +    alloc_args = (struct nvmap_alloc_handle){
>> +        .handle    = create_args.handle,
>> +        .heap_mask = heap_mask,
>> +        .flags     = flags | (MEM_TAG << 16),
>> +        .align     = align,
>> +    };
>> +
>> +    err = ioctl(get_nvmap_fd(), NVMAP_IOC_ALLOC, &alloc_args);
>> +    if (err < 0)
>> +        goto fail;
>> +
>> +    return 0;
>> +
>> +fail:
>> +    av_nvtegra_map_free(map);
>> +    return AVERROR(errno);
>> +#else
>> +    void *mem;
>> +
>> +    map->owner = channel->channel.fd;
>> +
>> +    size = FFALIGN(size, 0x1000);
>> +
>> +    mem = aligned_alloc(FFALIGN(align, 0x1000), size);
>> +    if (!mem)
>> +        return AVERROR(ENOMEM);
>> +
>> +    return AVERROR(nvMapCreate(&map->map, mem, size, 0x10000, NvKind_Pitch,
>> +                               convert_cache_flags(flags)));
>> +#endif
>> +}
>> +
>> +int av_nvtegra_map_free(AVNVTegraMap *map) {
>> +#ifndef __SWITCH__
>> +    int err;
>> +
>> +    if (!map->handle)
>> +        return 0;
>> +
>> +    err = ioctl(get_nvmap_fd(), NVMAP_IOC_FREE, map->handle);
>> +    if (err < 0)
>> +        return AVERROR(errno);
>> +
>> +    map->handle = 0;
>> +
>> +    return 0;
>> +#else
>> +    void *addr = map->map.cpu_addr;
>> +
>> +    if (!map->map.cpu_addr)
>> +        return 0;
>> +
>> +    nvMapClose(&map->map);
>> +    free(addr);
>> +    return 0;
>> +#endif
>> +}
>> +
>> +int av_nvtegra_map_from_va(AVNVTegraMap *map, AVNVTegraChannel *owner, void *mem,
>> +                           uint32_t size, uint32_t align, uint32_t flags)
>> +{
>> +#ifndef __SWITCH__
>> +    struct nvmap_create_handle_from_va args;
>> +    int err;
>> +
>> +    args = (struct nvmap_create_handle_from_va){
>> +        .va    = (uintptr_t)mem,
>> +        .size  = size,
>> +        .flags = flags | (MEM_TAG << 16),
>> +    };
>> +
>> +    err = ioctl(get_nvmap_fd(), NVMAP_IOC_FROM_VA, &args);
>> +    if (err < 0)
>> +        return AVERROR(errno);
>> +
>> +    map->cpu_addr = mem;
>> +    map->size     = size;
>> +    map->handle   = args.handle;
>> +
>> +    return 0;
>> +#else
>> +
>> +    map->owner = owner->channel.fd;
>> +
>> +    return AVERROR(nvMapCreate(&map->map, mem, FFALIGN(size, 0x1000), 0x10000, NvKind_Pitch,
>> +                               convert_cache_flags(flags)));;
>> +#endif
>> +}
>> +
>> +int av_nvtegra_map_close(AVNVTegraMap *map) {
>> +#ifndef __SWITCH__
>> +    return av_nvtegra_map_free(map);
>> +#else
>> +    nvMapClose(&map->map);
>> +    return 0;
>> +#endif
>> +}
>> +
>> +int av_nvtegra_map_map(AVNVTegraMap *map) {
>> +#ifndef __SWITCH__
>> +    void *addr;
>> +
>> +    addr = mmap(NULL, map->size, PROT_READ | PROT_WRITE, MAP_SHARED, map->handle, 0);
>> +    if (addr == MAP_FAILED)
>> +        return AVERROR(errno);
>> +
>> +    map->cpu_addr = addr;
>> +
>> +    return 0;
>> +#else
>> +    nvioctl_command_buffer_map params;
>> +    int err;
>> +
>> +    params = (nvioctl_command_buffer_map){
>> +        .handle = map->map.handle,
>> +    };
>> +
>> +    err = nvioctlChannel_MapCommandBuffer(map->owner, &params, 1, false);
>> +    if (R_FAILED(err))
>> +        return AVERROR(err);
>> +
>> +    map->iova = params.iova;
>> +
>> +    return 0;
>> +#endif
>> +}
>> +
>> +int av_nvtegra_map_unmap(AVNVTegraMap *map) {
>> +    int err;
>> +#ifndef __SWITCH__
>> +    if (!map->cpu_addr)
>> +        return 0;
>> +
>> +    err = munmap(map->cpu_addr, map->size);
>> +    if (err < 0)
>> +        return AVERROR(errno);
>> +
>> +    map->cpu_addr = NULL;
>> +
>> +    return 0;
>> +#else
>> +    nvioctl_command_buffer_map params;
>> +
>> +    if (!map->iova)
>> +        return 0;
>> +
>> +    params = (nvioctl_command_buffer_map){
>> +        .handle = map->map.handle,
>> +        .iova   = map->iova,
>> +    };
>> +
>> +    err = nvioctlChannel_UnmapCommandBuffer(map->owner, &params, 1, false);
>> +    if (R_FAILED(err))
>> +        return AVERROR(err);
>> +
>> +    map->iova = 0;
>> +
>> +    return 0;
>> +#endif
>> +}
>> +
>> +int av_nvtegra_map_cache_op(AVNVTegraMap *map, int op, void *addr, size_t len) {
>> +#ifndef __SWITCH__
>> +    struct nvmap_cache_op args;
>> +
>> +    args = (struct nvmap_cache_op){
>> +        .addr   = (uintptr_t)addr,
>> +        .len    = len,
>> +        .handle = av_nvtegra_map_get_handle(map),
>> +        .op     = op,
>> +    };
>> +
>> +    return AVERROR(ioctl(get_nvmap_fd(), NVMAP_IOC_CACHE, &args));
>> +#else
>> +    if (!map->map.is_cpu_cacheable)
>> +        return 0;
>> +
>> +    switch (op) {
>> +        case NVMAP_CACHE_OP_WB:
>> +            armDCacheClean(addr, len);
>> +            break;
>> +        default:
>> +        case NVMAP_CACHE_OP_INV:
>> +        case NVMAP_CACHE_OP_WB_INV:
>> +            /* libnx internally performs a clean-invalidate, since invalidate is a privileged instruction */
>> +            armDCacheFlush(addr, len);
>> +            break;
>> +    }
>> +
>> +    return 0;
>> +#endif
>> +}
>> +
>> +int av_nvtegra_map_realloc(AVNVTegraMap *map, uint32_t size, uint32_t align,
>> +                           int heap_mask, int flags)
>> +{
>> +    AVNVTegraChannel channel;
>> +    AVNVTegraMap tmp = {0};
>> +    int err;
>> +
>> +    if (av_nvtegra_map_get_size(map) >= size)
>> +        return 0;
>> +
>> +    /* Dummy channel object to hold the owner fd */
>> +    channel = (AVNVTegraChannel){
>> +#ifdef __SWITCH__
>> +        .channel.fd = map->owner,
>> +#endif
>> +    };
>> +
>> +    err = av_nvtegra_map_create(&tmp, &channel, size, align, heap_mask, flags);
>> +    if (err < 0)
>> +        goto fail;
>> +
>> +    memcpy(av_nvtegra_map_get_addr(&tmp), av_nvtegra_map_get_addr(map), av_nvtegra_map_get_size(map));
>> +
>> +    err = av_nvtegra_map_destroy(map);
>> +    if (err < 0)
>> +        goto fail;
>> +
>> +    *map = tmp;
>> +
>> +    return 0;
>> +
>> +fail:
>> +    av_nvtegra_map_destroy(&tmp);
>> +    return err;
>> +}
>> +
>> +int av_nvtegra_cmdbuf_init(AVNVTegraCmdbuf *cmdbuf) {
>> +    cmdbuf->num_cmdbufs      = 0;
>> +#ifndef __SWITCH__
>> +    cmdbuf->num_relocs       = 0;
>> +    cmdbuf->num_waitchks     = 0;
>> +#endif
>> +    cmdbuf->num_syncpt_incrs = 0;
>> +
>> +#define NUM_INITIAL_CMDBUFS      3
>> +#define NUM_INITIAL_RELOCS       15
>> +#define NUM_INITIAL_SYNCPT_INCRS 3
>> +
>> +    cmdbuf->cmdbufs      = av_malloc_array(NUM_INITIAL_CMDBUFS, sizeof(*cmdbuf->cmdbufs));
>> +#ifndef __SWITCH__
>> +    cmdbuf->cmdbuf_exts  = av_malloc_array(NUM_INITIAL_CMDBUFS, sizeof(*cmdbuf->cmdbuf_exts));
>> +    cmdbuf->class_ids    = av_malloc_array(NUM_INITIAL_CMDBUFS, sizeof(*cmdbuf->class_ids));
>> +#endif
>> +
>> +#ifndef __SWITCH__
>> +    if (!cmdbuf->cmdbufs || !cmdbuf->cmdbuf_exts || !cmdbuf->class_ids)
>> +#else
>> +    if (!cmdbuf->cmdbufs)
>> +#endif
>> +        return AVERROR(ENOMEM);
>> +
>> +#ifndef __SWITCH__
>> +    cmdbuf->relocs       = av_malloc_array(NUM_INITIAL_RELOCS, sizeof(*cmdbuf->relocs));
>> +    cmdbuf->reloc_types  = av_malloc_array(NUM_INITIAL_RELOCS, sizeof(*cmdbuf->reloc_types));
>> +    cmdbuf->reloc_shifts = av_malloc_array(NUM_INITIAL_RELOCS, sizeof(*cmdbuf->reloc_shifts));
>> +    if (!cmdbuf->relocs || !cmdbuf->reloc_types || !cmdbuf->reloc_shifts)
>> +        return AVERROR(ENOMEM);
>> +#endif
>> +
>> +    cmdbuf->syncpt_incrs = av_malloc_array(NUM_INITIAL_SYNCPT_INCRS, sizeof(*cmdbuf->syncpt_incrs));
>> +#ifndef __SWITCH__
>> +    cmdbuf->fences       = av_malloc_array(NUM_INITIAL_SYNCPT_INCRS, sizeof(*cmdbuf->fences));
>> +#endif
>> +
>> +#ifndef __SWITCH__
>> +    if (!cmdbuf->syncpt_incrs || !cmdbuf->fences)
>> +#else
>> +    if (!cmdbuf->syncpt_incrs)
>> +#endif
>> +        return AVERROR(ENOMEM);
>> +
>> +    return 0;
>> +}
>> +
>> +int av_nvtegra_cmdbuf_deinit(AVNVTegraCmdbuf *cmdbuf) {
>> +    av_freep(&cmdbuf->cmdbufs);
>> +    av_freep(&cmdbuf->syncpt_incrs);
>> +
>> +#ifndef __SWITCH__
>> +    av_freep(&cmdbuf->cmdbuf_exts), av_freep(&cmdbuf->class_ids);
>> +    av_freep(&cmdbuf->relocs), av_freep(&cmdbuf->reloc_types), av_freep(&cmdbuf->reloc_shifts);
>> +    av_freep(&cmdbuf->fences);
>> +#endif
>> +
>> +    return 0;
>> +}
>> +
>> +int av_nvtegra_cmdbuf_add_memory(AVNVTegraCmdbuf *cmdbuf, AVNVTegraMap *map, uint32_t offset, uint32_t size) {
>> +    uint8_t *mem;
>> +
>> +    mem = av_nvtegra_map_get_addr(map);
>> +
>> +    cmdbuf->map        = map;
>> +    cmdbuf->mem_offset = offset;
>> +    cmdbuf->mem_size   = size;
>> +
>> +    cmdbuf->cur_word = (uint32_t *)(mem + cmdbuf->mem_offset);
>> +
>> +    return 0;
>> +}
>> +
>> +int av_nvtegra_cmdbuf_clear(AVNVTegraCmdbuf *cmdbuf) {
>> +    uint8_t *mem;
>> +
>> +    mem = av_nvtegra_map_get_addr(cmdbuf->map);
>> +
>> +    cmdbuf->num_cmdbufs = 0, cmdbuf->num_syncpt_incrs = 0;
>> +#ifndef __SWITCH__
>> +    cmdbuf->num_relocs = 0, cmdbuf->num_waitchks = 0;
>> +#endif
>> +
>> +    cmdbuf->cur_word = (uint32_t *)(mem + cmdbuf->mem_offset);
>> +    return 0;
>> +}
>> +
>> +int av_nvtegra_cmdbuf_begin(AVNVTegraCmdbuf *cmdbuf, uint32_t class_id) {
>> +    uint8_t *mem;
>> +    void *tmp1;
>> +#ifndef __SWITCH__
>> +    void *tmp2, *tmp3;
>> +#endif
>> +
>> +    mem = av_nvtegra_map_get_addr(cmdbuf->map);
>> +
>> +    tmp1 = av_realloc_array(cmdbuf->cmdbufs,     cmdbuf->num_cmdbufs + 1, sizeof(*cmdbuf->cmdbufs));
>> +#ifndef __SWITCH__
>> +    tmp2 = av_realloc_array(cmdbuf->cmdbuf_exts, cmdbuf->num_cmdbufs + 1, sizeof(*cmdbuf->cmdbuf_exts));
>> +    tmp3 = av_realloc_array(cmdbuf->class_ids,   cmdbuf->num_cmdbufs + 1, sizeof(*cmdbuf->class_ids));
>> +#endif
>> +
>> +#ifndef __SWITCH__
>> +    if (!tmp1 || !tmp2 || !tmp3)
>> +#else
>> +    if (!tmp1)
>> +#endif
>> +        return AVERROR(ENOMEM);
>> +
>> +    cmdbuf->cmdbufs = tmp1;
>> +
>> +#ifndef __SWITCH__
>> +    cmdbuf->cmdbuf_exts = tmp2, cmdbuf->class_ids = tmp3;
>> +#endif
>> +
>> +    cmdbuf->cmdbufs[cmdbuf->num_cmdbufs] = (struct nvhost_cmdbuf){
>> +        .mem       = av_nvtegra_map_get_handle(cmdbuf->map),
>> +        .offset    = (uint8_t *)cmdbuf->cur_word - mem,
>> +    };
>> +
>> +#ifndef __SWITCH__
>> +    cmdbuf->cmdbuf_exts[cmdbuf->num_cmdbufs] = (struct nvhost_cmdbuf_ext){
>> +        .pre_fence = -1,
>> +    };
>> +
>> +    cmdbuf->class_ids[cmdbuf->num_cmdbufs] = class_id;
>> +#endif
>> +
>> +#ifdef __SWITCH__
>> +    if (cmdbuf->num_cmdbufs == 0)
>> +        av_nvtegra_cmdbuf_push_word(cmdbuf, host1x_opcode_setclass(class_id, 0, 0));
>> +#endif
>> +
>> +    return 0;
>> +}
>> +
>> +int av_nvtegra_cmdbuf_end(AVNVTegraCmdbuf *cmdbuf) {
>> +    cmdbuf->num_cmdbufs++;
>> +    return 0;
>> +}
>> +
>> +int av_nvtegra_cmdbuf_push_word(AVNVTegraCmdbuf *cmdbuf, uint32_t word) {
>> +    uintptr_t mem_start = (uintptr_t)av_nvtegra_map_get_addr(cmdbuf->map) + cmdbuf->mem_offset;
>> +
>> +    if ((uintptr_t)cmdbuf->cur_word - mem_start >= cmdbuf->mem_size)
>> +        return AVERROR(ENOMEM);
>> +
>> +    *cmdbuf->cur_word++ = word;
>> +    cmdbuf->cmdbufs[cmdbuf->num_cmdbufs].words += 1;
>> +    return 0;
>> +}
>> +
>> +int av_nvtegra_cmdbuf_push_value(AVNVTegraCmdbuf *cmdbuf, uint32_t offset, uint32_t word) {
>> +    int err;
>> +
>> +    err = av_nvtegra_cmdbuf_push_word(cmdbuf, host1x_opcode_incr(NV_THI_METHOD0>>2, 2));
>> +    if (err < 0)
>> +        return err;
>> +
>> +    err = av_nvtegra_cmdbuf_push_word(cmdbuf, offset);
>> +    if (err < 0)
>> +        return err;
>> +
>> +    err = av_nvtegra_cmdbuf_push_word(cmdbuf, word);
>> +    if (err < 0)
>> +        return err;
>> +
>> +    return 0;
>> +}
>> +
>> +int av_nvtegra_cmdbuf_push_reloc(AVNVTegraCmdbuf *cmdbuf, uint32_t offset, AVNVTegraMap *target, uint32_t target_offset,
>> +                                 int reloc_type, int shift)
>> +{
>> +    int err;
>> +#ifndef __SWITCH__
>> +    uint8_t *mem;
>> +    void *tmp1, *tmp2, *tmp3;
>> +
>> +    mem = av_nvtegra_map_get_addr(cmdbuf->map);
>> +
>> +    tmp1 = av_realloc_array(cmdbuf->relocs,       cmdbuf->num_relocs + 1, sizeof(*cmdbuf->relocs));
>> +    tmp2 = av_realloc_array(cmdbuf->reloc_types,  cmdbuf->num_relocs + 1, sizeof(*cmdbuf->reloc_types));
>> +    tmp3 = av_realloc_array(cmdbuf->reloc_shifts, cmdbuf->num_relocs + 1, sizeof(*cmdbuf->reloc_shifts));
>> +    if (!tmp1 || !tmp2 || !tmp3)
>> +        return AVERROR(ENOMEM);
>> +
>> +    cmdbuf->relocs = tmp1, cmdbuf->reloc_types = tmp2, cmdbuf->reloc_shifts = tmp3;
>> +
>> +    err = av_nvtegra_cmdbuf_push_value(cmdbuf, offset, 0xdeadbeef);
>> +    if (err < 0)
>> +        return err;
>> +
>> +    cmdbuf->relocs[cmdbuf->num_relocs]       = (struct nvhost_reloc){
>> +        .cmdbuf_mem    = av_nvtegra_map_get_handle(cmdbuf->map),
>> +        .cmdbuf_offset = (uint8_t *)cmdbuf->cur_word - mem - sizeof(uint32_t),
>> +        .target        = av_nvtegra_map_get_handle(target),
>> +        .target_offset = target_offset,
>> +    };
>> +
>> +    cmdbuf->reloc_types[cmdbuf->num_relocs]  = (struct nvhost_reloc_type){
>> +        .reloc_type    = reloc_type,
>> +    };
>> +
>> +    cmdbuf->reloc_shifts[cmdbuf->num_relocs] = (struct nvhost_reloc_shift){
>> +        .shift         = shift,
>> +    };
>> +
>> +    cmdbuf->num_relocs++;
>> +
>> +    return 0;
>> +#else
>> +    err = av_nvtegra_cmdbuf_push_value(cmdbuf, offset, (target->iova + target_offset) >> shift);
>> +    if (err < 0)
>> +        return err;
>> +
>> +    return 0;
>> +#endif
>> +}
>> +
>> +int av_nvtegra_cmdbuf_push_syncpt_incr(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt) {
>> +    int err;
>> +
>> +    err = av_nvtegra_cmdbuf_push_word(cmdbuf, host1x_opcode_nonincr(NV_THI_INCR_SYNCPT>>2, 1));
>> +    if (err < 0)
>> +        return err;
>> +
>> +    err = av_nvtegra_cmdbuf_push_word(cmdbuf,
>> +                                      AV_NVTEGRA_VALUE(NV_THI_INCR_SYNCPT, INDX, syncpt) |
>> +                                      AV_NVTEGRA_ENUM (NV_THI_INCR_SYNCPT, COND, OP_DONE));
>> +    if (err < 0)
>> +        return err;
>> +
>> +    return 0;
>> +}
>> +
>> +int av_nvtegra_cmdbuf_push_wait(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt, uint32_t fence) {
>> +    int err;
>> +
>> +    err = av_nvtegra_cmdbuf_push_word(cmdbuf, host1x_opcode_setclass(HOST1X_CLASS_HOST1X, 0, 0));
>> +    if (err < 0)
>> +        return err;
>> +
>> +    err = av_nvtegra_cmdbuf_push_word(cmdbuf, host1x_opcode_mask(NV_CLASS_HOST_LOAD_SYNCPT_PAYLOAD>>2,
>> +                                      (1<<(NV_CLASS_HOST_LOAD_SYNCPT_PAYLOAD - NV_CLASS_HOST_LOAD_SYNCPT_PAYLOAD)) |
>> +                                      (1<<(NV_CLASS_HOST_WAIT_SYNCPT         - NV_CLASS_HOST_LOAD_SYNCPT_PAYLOAD))));
>> +    if (err < 0)
>> +        return err;
>> +
>> +    err = av_nvtegra_cmdbuf_push_word(cmdbuf, fence);
>> +    if (err < 0)
>> +        return err;
>> +
>> +    err = av_nvtegra_cmdbuf_push_word(cmdbuf, syncpt);
>> +    if (err < 0)
>> +        return err;
>> +
>> +    return 0;
>> +}
>> +
>> +int av_nvtegra_cmdbuf_add_syncpt_incr(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt, uint32_t fence)
>> +{
>> +    void *tmp1;
>> +#ifndef __SWITCH__
>> +    void *tmp2;
>> +#endif
>> +
>> +    tmp1 = av_realloc_array(cmdbuf->syncpt_incrs, cmdbuf->num_syncpt_incrs + 1, sizeof(*cmdbuf->syncpt_incrs));
>> +#ifndef __SWITCH__
>> +    tmp2 = av_realloc_array(cmdbuf->fences,       cmdbuf->num_syncpt_incrs + 1, sizeof(*cmdbuf->fences));
>> +#endif
>> +
>> +#ifndef __SWITCH__
>> +    if (!tmp1 || !tmp2)
>> +#else
>> +    if (!tmp1)
>> +#endif
>> +        return AVERROR(ENOMEM);
>> +
>> +    cmdbuf->syncpt_incrs = tmp1;
>> +#ifndef __SWITCH__
>> +    cmdbuf->fences       = tmp2;
>> +#endif
>> +
>> +    cmdbuf->syncpt_incrs[cmdbuf->num_syncpt_incrs] = (struct nvhost_syncpt_incr){
>> +        .syncpt_id    = syncpt,
>> +        .syncpt_incrs = 1,
>> +    };
>> +
>> +#ifndef __SWITCH__
>> +    cmdbuf->fences[cmdbuf->num_syncpt_incrs]       = fence;
>> +#endif
>> +
>> +    cmdbuf->num_syncpt_incrs++;
>> +
>> +    return av_nvtegra_cmdbuf_push_syncpt_incr(cmdbuf, syncpt);
>> +}
>> +
>> +int av_nvtegra_cmdbuf_add_waitchk(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt, uint32_t fence) {
>> +#ifndef __SWITCH__
>> +    uint8_t *mem;
>> +    void *tmp;
>> +
>> +    mem = av_nvtegra_map_get_addr(cmdbuf->map);
>> +
>> +    tmp = av_realloc_array(cmdbuf->waitchks, cmdbuf->num_waitchks + 1, sizeof(*cmdbuf->waitchks));
>> +    if (!tmp)
>> +        return AVERROR(ENOMEM);
>> +
>> +    cmdbuf->waitchks = tmp;
>> +
>> +    cmdbuf->waitchks[cmdbuf->num_waitchks] = (struct nvhost_waitchk){
>> +        .mem       = av_nvtegra_map_get_handle(cmdbuf->map),
>> +        .offset    = (uint8_t *)cmdbuf->cur_word - mem - sizeof(uint32_t),
>> +        .syncpt_id = syncpt,
>> +        .thresh    = fence,
>> +    };
>> +
>> +    cmdbuf->num_waitchks++;
>> +#endif
>> +
>> +    return av_nvtegra_cmdbuf_push_wait(cmdbuf, syncpt, fence);
>> +}
>> +
>> +static void nvtegra_job_free(void *opaque, uint8_t *data) {
>> +    AVNVTegraJob *job = (AVNVTegraJob *)data;
>> +
>> +    if (!job)
>> +        return;
>> +
>> +    av_nvtegra_cmdbuf_deinit(&job->cmdbuf);
>> +    av_nvtegra_map_destroy(&job->input_map);
>> +
>> +    av_freep(&job);
>> +}
>> +
>> +static AVBufferRef *nvtegra_job_alloc(void *opaque, size_t size) {
>> +    AVNVTegraJobPool *pool = opaque;
>> +
>> +    AVBufferRef  *buffer;
>> +    AVNVTegraJob *job;
>> +    int err;
>> +
>> +    job = av_mallocz(sizeof(*job));
>> +    if (!job)
>> +        return NULL;
>> +
>> +    err = av_nvtegra_map_create(&job->input_map, pool->channel, pool->input_map_size, 0x100,
>> +                                NVMAP_HEAP_IOVMM, NVMAP_HANDLE_WRITE_COMBINE);
>> +    if (err < 0)
>> +        goto fail;
>> +
>> +    err = av_nvtegra_cmdbuf_init(&job->cmdbuf);
>> +    if (err < 0)
>> +        goto fail;
>> +
>> +    err = av_nvtegra_cmdbuf_add_memory(&job->cmdbuf, &job->input_map, pool->cmdbuf_off, pool->max_cmdbuf_size);
>> +    if (err < 0)
>> +        goto fail;
>> +
>> +    buffer = av_buffer_create((uint8_t *)job, sizeof(*job), nvtegra_job_free, pool, 0);
>> +    if (!buffer)
>> +        goto fail;
>> +
>> +    return buffer;
>> +
>> +fail:
>> +    av_nvtegra_cmdbuf_deinit(&job->cmdbuf);
>> +    av_nvtegra_map_destroy(&job->input_map);
>> +    av_freep(job);
>> +    return NULL;
>> +}
>> +
>> +int av_nvtegra_job_pool_init(AVNVTegraJobPool *pool, AVNVTegraChannel *channel,
>> +                             size_t input_map_size, off_t cmdbuf_off, size_t max_cmdbuf_size)
>> +{
>> +    pool->channel         = channel;
>> +    pool->input_map_size  = input_map_size;
>> +    pool->cmdbuf_off      = cmdbuf_off;
>> +    pool->max_cmdbuf_size = max_cmdbuf_size;
>> +    pool->pool            = av_buffer_pool_init2(sizeof(AVNVTegraJob), pool,
>> +                                                 nvtegra_job_alloc, NULL);
>> +    if (!pool->pool)
>> +        return AVERROR(ENOMEM);
>> +
>> +    return 0;
>> +}
>> +
>> +int av_nvtegra_job_pool_uninit(AVNVTegraJobPool *pool) {
>> +    av_buffer_pool_uninit(&pool->pool);
>> +    return 0;
>> +}
>> +
>> +AVBufferRef *av_nvtegra_job_pool_get(AVNVTegraJobPool *pool) {
>> +    return av_buffer_pool_get(pool->pool);
>> +}
>> +
>> +int av_nvtegra_job_submit(AVNVTegraJobPool *pool, AVNVTegraJob *job) {
>> +    return av_nvtegra_channel_submit(pool->channel, &job->cmdbuf, &job->fence);
>> +}
>> +
>> +int av_nvtegra_job_wait(AVNVTegraJobPool *pool, AVNVTegraJob *job, int timeout) {
>> +    return av_nvtegra_syncpt_wait(pool->channel, job->fence, timeout);
>> +}
>> diff --git a/libavutil/nvtegra.h b/libavutil/nvtegra.h
>> new file mode 100644
>> index 0000000000..3b63335d6c
>> --- /dev/null
>> +++ b/libavutil/nvtegra.h
>> @@ -0,0 +1,258 @@
>> +/*
>> + * Copyright (c) 2024 averne <averne381@gmail.com>
>> + *
>> + * This file is part of FFmpeg.
>> + *
>> + * FFmpeg is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + *
>> + * FFmpeg is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License along
>> + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
>> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
>> + */
>> +
>> +#ifndef AVUTIL_NVTEGRA_H
>> +#define AVUTIL_NVTEGRA_H
>> +
>> +#include <stdint.h>
>> +#include <stdbool.h>
>> +
>> +#include "buffer.h"
>> +
>> +#include "nvhost_ioctl.h"
>> +#include "nvmap_ioctl.h"
>> +
>> +typedef struct AVNVTegraChannel {
>> +#ifndef __SWITCH__
>> +    int fd;
>> +    int module_id;
>> +#else
>> +    NvChannel channel;
>> +#endif
>> +
>> +    uint32_t syncpt;
>> +
>> +#ifdef __SWITCH__
>> +    MmuRequest mmu_request;
>> +#endif
>> +    uint32_t clock;
>> +} AVNVTegraChannel;
>> +
>> +typedef struct AVNVTegraMap {
>> +#ifndef __SWITCH__
>> +    uint32_t handle;
>> +    uint32_t size;
>> +    void *cpu_addr;
>> +#else
>> +    NvMap map;
>> +    uint32_t iova;
>> +    uint32_t owner;
>> +#endif
>> +    bool is_linear;
>> +} AVNVTegraMap;
>> +
>> +typedef struct AVNVTegraCmdbuf {
>> +    AVNVTegraMap *map;
>> +
>> +    uint32_t mem_offset, mem_size;
>> +
>> +    uint32_t *cur_word;
>> +
>> +    struct nvhost_cmdbuf       *cmdbufs;
>> +#ifndef __SWITCH__
>> +    struct nvhost_cmdbuf_ext   *cmdbuf_exts;
>> +    uint32_t                   *class_ids;
>> +#endif
>> +    uint32_t num_cmdbufs;
>> +
>> +#ifndef __SWITCH__
>> +    struct nvhost_reloc        *relocs;
>> +    struct nvhost_reloc_type   *reloc_types;
>> +    struct nvhost_reloc_shift  *reloc_shifts;
>> +    uint32_t num_relocs;
>> +#endif
>> +
>> +    struct nvhost_syncpt_incr  *syncpt_incrs;
>> +#ifndef __SWITCH__
>> +    uint32_t                   *fences;
>> +#endif
>> +    uint32_t num_syncpt_incrs;
>> +
>> +#ifndef __SWITCH__
>> +    struct nvhost_waitchk      *waitchks;
>> +    uint32_t num_waitchks;
>> +#endif
>> +} AVNVTegraCmdbuf;
>> +
>> +typedef struct AVNVTegraJobPool {
>> +    /*
>> +     * Pool object for job allocation
>> +     */
>> +    AVBufferPool *pool;
>> +
>> +    /*
>> +     * Hardware channel the jobs will be submitted to
>> +     */
>> +    AVNVTegraChannel *channel;
>> +
>> +    /*
>> +     * Total size of the input memory-mapped buffer
>> +     */
>> +    size_t input_map_size;
>> +
>> +    /*
>> +     * Offset of the command data within the input map
>> +     */
>> +    off_t cmdbuf_off;
>> +
>> +    /*
>> +     * Maximum memory usable by the command buffer
>> +     */
>> +    size_t max_cmdbuf_size;
>> +} AVNVTegraJobPool;
>> +
>> +typedef struct AVNVTegraJob {
>> +    /*
>> +     * Memory-mapped buffer for command buffers, metadata structures, ...
>> +     */
>> +    AVNVTegraMap input_map;
>> +
>> +    /*
>> +     * Object for command recording
>> +     */
>> +    AVNVTegraCmdbuf cmdbuf;
>> +
>> +    /*
>> +     * Fence indicating completion of the job
>> +     */
>> +    uint32_t fence;
>> +} AVNVTegraJob;
>> +
>> +AVBufferRef *av_nvtegra_driver_init(void);
>> +
>> +int av_nvtegra_channel_open(AVNVTegraChannel *channel, const char *dev);
>> +int av_nvtegra_channel_close(AVNVTegraChannel *channel);
>> +int av_nvtegra_channel_get_clock_rate(AVNVTegraChannel *channel, uint32_t moduleid, uint32_t *clock_rate);
>> +int av_nvtegra_channel_set_clock_rate(AVNVTegraChannel *channel, uint32_t moduleid, uint32_t clock_rate);
>> +int av_nvtegra_channel_submit(AVNVTegraChannel *channel, AVNVTegraCmdbuf *cmdbuf, uint32_t *fence);
>> +int av_nvtegra_channel_set_submit_timeout(AVNVTegraChannel *channel, uint32_t timeout_ms);
>> +
>> +int av_nvtegra_syncpt_wait(AVNVTegraChannel *channel, uint32_t threshold, int32_t timeout);
>> +
>> +int av_nvtegra_map_allocate(AVNVTegraMap *map, AVNVTegraChannel *owner, uint32_t size,
>> +                            uint32_t align, int heap_mask, int flags);
>> +int av_nvtegra_map_free(AVNVTegraMap *map);
>> +int av_nvtegra_map_from_va(AVNVTegraMap *map, AVNVTegraChannel *owner, void *mem,
>> +                           uint32_t size, uint32_t align, uint32_t flags);
>> +int av_nvtegra_map_close(AVNVTegraMap *map);
>> +int av_nvtegra_map_map(AVNVTegraMap *map);
>> +int av_nvtegra_map_unmap(AVNVTegraMap *map);
>> +int av_nvtegra_map_cache_op(AVNVTegraMap *map, int op, void *addr, size_t len);
>> +int av_nvtegra_map_realloc(AVNVTegraMap *map, uint32_t size, uint32_t align, int heap_mask, int flags);
>> +
>> +static inline int av_nvtegra_map_create(AVNVTegraMap *map, AVNVTegraChannel *owner, uint32_t size, uint32_t align,
>> +                                        int heap_mask, int flags)
>> +{
>> +    int err;
>> +
>> +    err = av_nvtegra_map_allocate(map, owner, size, align, heap_mask, flags);
>> +    if (err < 0)
>> +        return err;
>> +
>> +    return av_nvtegra_map_map(map);
>> +}
>> +
>> +static inline int av_nvtegra_map_destroy(AVNVTegraMap *map) {
>> +    int err;
>> +
>> +    err = av_nvtegra_map_unmap(map);
>> +    if (err < 0)
>> +        return err;
>> +
>> +    return av_nvtegra_map_free(map);
>> +}
>> +
>> +int av_nvtegra_cmdbuf_init(AVNVTegraCmdbuf *cmdbuf);
>> +int av_nvtegra_cmdbuf_deinit(AVNVTegraCmdbuf *cmdbuf);
>> +int av_nvtegra_cmdbuf_add_memory(AVNVTegraCmdbuf *cmdbuf, AVNVTegraMap *map, uint32_t offset, uint32_t size);
>> +int av_nvtegra_cmdbuf_clear(AVNVTegraCmdbuf *cmdbuf);
>> +int av_nvtegra_cmdbuf_begin(AVNVTegraCmdbuf *cmdbuf, uint32_t class_id);
>> +int av_nvtegra_cmdbuf_end(AVNVTegraCmdbuf *cmdbuf);
>> +int av_nvtegra_cmdbuf_push_word(AVNVTegraCmdbuf *cmdbuf, uint32_t word);
>> +int av_nvtegra_cmdbuf_push_value(AVNVTegraCmdbuf *cmdbuf, uint32_t offset, uint32_t word);
>> +int av_nvtegra_cmdbuf_push_reloc(AVNVTegraCmdbuf *cmdbuf, uint32_t offset, AVNVTegraMap *target, uint32_t target_offset,
>> +                                 int reloc_type, int shift);
>> +int av_nvtegra_cmdbuf_push_syncpt_incr(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt);
>> +int av_nvtegra_cmdbuf_push_wait(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt, uint32_t fence);
>> +int av_nvtegra_cmdbuf_add_syncpt_incr(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt, uint32_t fence);
>> +int av_nvtegra_cmdbuf_add_waitchk(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt, uint32_t fence);
>> +
>> +/*
>> + * Job allocation and submission routines
>> + */
>> +int av_nvtegra_job_pool_init(AVNVTegraJobPool *pool, AVNVTegraChannel *channel,
>> +                             size_t input_map_size, off_t cmdbuf_off, size_t max_cmdbuf_size);
>> +int av_nvtegra_job_pool_uninit(AVNVTegraJobPool *pool);
>> +AVBufferRef *av_nvtegra_job_pool_get(AVNVTegraJobPool *pool);
>> +
>> +int av_nvtegra_job_submit(AVNVTegraJobPool *pool, AVNVTegraJob *job);
>> +int av_nvtegra_job_wait(AVNVTegraJobPool *pool, AVNVTegraJob *job, int timeout);
>> +
>> +static inline uint32_t av_nvtegra_map_get_handle(AVNVTegraMap *map) {
>> +#ifndef __SWITCH__
>> +    return map->handle;
>> +#else
>> +    return map->map.handle;
>> +#endif
>> +}
>> +
>> +static inline void *av_nvtegra_map_get_addr(AVNVTegraMap *map) {
>> +#ifndef __SWITCH__
>> +    return map->cpu_addr;
>> +#else
>> +    return map->map.cpu_addr;
>> +#endif
>> +}
>> +
>> +static inline uint32_t av_nvtegra_map_get_size(AVNVTegraMap *map) {
>> +#ifndef __SWITCH__
>> +    return map->size;
>> +#else
>> +    return map->map.size;
>> +#endif
>> +}
>> +
>> +/* Addresses are shifted by 8 bits in the command buffer, requiring an alignment to 256 */
>> +#define AV_NVTEGRA_MAP_ALIGN (1 << 8)
>> +
>> +#define AV_NVTEGRA_VALUE(offset, field, value)                                                    \
>> +    ((value &                                                                                     \
>> +    ((uint32_t)((UINT64_C(1) << ((1?offset ## _ ## field) - (0?offset ## _ ## field) + 1)) - 1))) \
>> +    << (0?offset ## _ ## field))
>> +
>> +#define AV_NVTEGRA_ENUM(offset, field, value)                                                     \
>> +    ((offset ## _ ## field ## _ ## value &                                                        \
>> +    ((uint32_t)((UINT64_C(1) << ((1?offset ## _ ## field) - (0?offset ## _ ## field) + 1)) - 1))) \
>> +    << (0?offset ## _ ## field))
>> +
>> +/*
>> + * Push a value through av_nvtegra_cmdbuf_push_value(), converting the byte
>> + * offset to a 32-bit word index.
>> + * Warning: on failure this macro executes "return" in the calling function,
>> + * so it can only be used inside functions returning an int error code.
>> + * It relies on the statement-expression compiler extension.
>> + */
>> +#define AV_NVTEGRA_PUSH_VALUE(cmdbuf, offset, value) ({                                  \
>> +    int _err = av_nvtegra_cmdbuf_push_value(cmdbuf, (offset) / sizeof(uint32_t), value); \
>> +    if (_err < 0)                                                                        \
>> +        return _err;                                                                     \
>> +})
>> +
>> +/*
>> + * Push a relocation through av_nvtegra_cmdbuf_push_reloc(), converting the byte
>> + * offset to a 32-bit word index and always passing a shift of 8.
>> + * Warning: on failure this macro executes "return" in the calling function,
>> + * so it can only be used inside functions returning an int error code.
>> + * It relies on the statement-expression compiler extension.
>> + */
>> +#define AV_NVTEGRA_PUSH_RELOC(cmdbuf, offset, target, target_offset, type) ({    \
>> +    int _err = av_nvtegra_cmdbuf_push_reloc(cmdbuf, (offset) / sizeof(uint32_t), \
>> +                                            target, target_offset, type, 8);     \
>> +    if (_err < 0)                                                                \
>> +        return _err;                                                             \
>> +})
>> +
>> +#endif /* AVUTIL_NVTEGRA_H */
>> diff --git a/libavutil/nvtegra_host1x.h b/libavutil/nvtegra_host1x.h
>> new file mode 100644
>> index 0000000000..25e37eae61
>> --- /dev/null
>> +++ b/libavutil/nvtegra_host1x.h
>> @@ -0,0 +1,94 @@
>> +/*
>> + * Copyright (c) 2024 averne <averne381@gmail.com>
>> + *
>> + * This file is part of FFmpeg.
>> + *
>> + * FFmpeg is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + *
>> + * FFmpeg is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License along
>> + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
>> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
>> + */
>> +
>> +#ifndef AVUTIL_NVTEGRA_HOST1X_H
>> +#define AVUTIL_NVTEGRA_HOST1X_H
>> +
>> +#include <stdint.h>
>> +
>> +#include "macros.h"
>> +
>> +/*
>> + * From L4T include/linux/host1x.h
>> + *
>> + * Numeric class identifiers, one per hardware block name.
>> + * NOTE(review): presumably these are the values passed as class_id to
>> + * host1x_opcode_setclass() -- confirm against the callers.
>> + */
>> +enum host1x_class {
>> +    HOST1X_CLASS_HOST1X  = 0x01,
>> +    HOST1X_CLASS_NVENC   = 0x21,
>> +    HOST1X_CLASS_VI      = 0x30,
>> +    HOST1X_CLASS_ISPA    = 0x32,
>> +    HOST1X_CLASS_ISPB    = 0x34,
>> +    HOST1X_CLASS_GR2D    = 0x51,
>> +    HOST1X_CLASS_GR2D_SB = 0x52,
>> +    HOST1X_CLASS_VIC     = 0x5d,
>> +    HOST1X_CLASS_GR3D    = 0x60,
>> +    HOST1X_CLASS_NVJPG   = 0xc0,
>> +    HOST1X_CLASS_NVDEC   = 0xf0,
>> +};
>> +
>> +/* Encode a word with opcode 0 in bits 28+, offset from bit 16, class_id from bit 6 and mask in the low bits. */
>> +static inline uint32_t host1x_opcode_setclass(unsigned class_id, unsigned offset, unsigned mask) {
>> +    const uint32_t opcode = UINT32_C(0) << 28;
>> +
>> +    return opcode | (offset << 16) | (class_id << 6) | mask;
>> +}
>> +
>> +/* Encode a word with opcode 1 in bits 28+, offset from bit 16 and count in the low bits. */
>> +static inline uint32_t host1x_opcode_incr(unsigned offset, unsigned count) {
>> +    const uint32_t opcode = UINT32_C(1) << 28;
>> +
>> +    return opcode | (offset << 16) | count;
>> +}
>> +
>> +/* Encode a word with opcode 2 in bits 28+, offset from bit 16 and count in the low bits. */
>> +static inline uint32_t host1x_opcode_nonincr(unsigned offset, unsigned count) {
>> +    const uint32_t opcode = UINT32_C(2) << 28;
>> +
>> +    return opcode | (offset << 16) | count;
>> +}
>> +
>> +/* Encode a word with opcode 3 in bits 28+, offset from bit 16 and mask in the low bits. */
>> +static inline uint32_t host1x_opcode_mask(unsigned offset, unsigned mask) {
>> +    const uint32_t opcode = UINT32_C(3) << 28;
>> +
>> +    return opcode | (offset << 16) | mask;
>> +}
>> +
>> +/* Encode a word with opcode 4 in bits 28+, offset from bit 16 and value in the low bits. */
>> +static inline uint32_t host1x_opcode_imm(unsigned offset, unsigned value) {
>> +    const uint32_t opcode = UINT32_C(4) << 28;
>> +
>> +    return opcode | (offset << 16) | value;
>> +}
>> +
>> +/*
>> + * Register and field definitions.
>> + * Entries written as "high:low" are bit ranges; they are meant to be consumed
>> + * through the (1?range) / (0?range) selection trick and are not usable as
>> + * plain expressions. Parenthesized entries are plain numeric constants.
>> + * NOTE(review): the parenthesized values directly under a register name look
>> + * like byte offsets of that register, and those under a field name like values
>> + * of that field -- confirm against the hardware headers.
>> + */
>> +#define NV_CLASS_HOST_LOAD_SYNCPT_PAYLOAD                                  (0x00000138)
>> +#define NV_CLASS_HOST_WAIT_SYNCPT                                          (0x00000140)
>> +
>> +#define NV_THI_INCR_SYNCPT                                                 (0x00000000)
>> +#define NV_THI_INCR_SYNCPT_INDX                                            7:0
>> +#define NV_THI_INCR_SYNCPT_COND                                            15:8
>> +#define NV_THI_INCR_SYNCPT_COND_IMMEDIATE                                  (0x00000000)
>> +#define NV_THI_INCR_SYNCPT_COND_OP_DONE                                    (0x00000001)
>> +#define NV_THI_INCR_SYNCPT_ERR                                             (0x00000008)
>> +#define NV_THI_INCR_SYNCPT_ERR_COND_STS_IMM                                0:0
>> +#define NV_THI_INCR_SYNCPT_ERR_COND_STS_OPDONE                             1:1
>> +#define NV_THI_CTXSW_INCR_SYNCPT                                           (0x0000000c)
>> +#define NV_THI_CTXSW_INCR_SYNCPT_INDX                                      7:0
>> +#define NV_THI_CTXSW                                                       (0x00000020)
>> +#define NV_THI_CTXSW_CURR_CLASS                                            9:0
>> +#define NV_THI_CTXSW_AUTO_ACK                                              11:11
>> +#define NV_THI_CTXSW_CURR_CHANNEL                                          15:12
>> +#define NV_THI_CTXSW_NEXT_CLASS                                            25:16
>> +#define NV_THI_CTXSW_NEXT_CHANNEL                                          31:28
>> +#define NV_THI_CONT_SYNCPT_EOF                                             (0x00000028)
>> +#define NV_THI_CONT_SYNCPT_EOF_INDEX                                       7:0
>> +#define NV_THI_CONT_SYNCPT_EOF_COND                                        8:8
>> +#define NV_THI_METHOD0                                                     (0x00000040)
>> +#define NV_THI_METHOD0_OFFSET                                              11:0
>> +#define NV_THI_METHOD1                                                     (0x00000044)
>> +#define NV_THI_METHOD1_DATA                                                31:0
>> +#define NV_THI_INT_STATUS                                                  (0x00000078)
>> +#define NV_THI_INT_STATUS_FALCON_INT                                       0:0
>> +#define NV_THI_INT_MASK                                                    (0x0000007c)
>> +#define NV_THI_INT_MASK_FALCON_INT                                         0:0
>> +
>> +#endif /* AVUTIL_NVTEGRA_HOST1X_H */
>> diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c
>> index 1c0bcf2232..bb14b1b306 100644
>> --- a/libavutil/pixdesc.c
>> +++ b/libavutil/pixdesc.c
>> @@ -2791,6 +2791,10 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
>>         },
>>         .flags = AV_PIX_FMT_FLAG_PLANAR,
>>     },
>> +    [AV_PIX_FMT_NVTEGRA] = {
>> +        .name = "nvtegra",
>> +        .flags = AV_PIX_FMT_FLAG_HWACCEL,
>> +    },
>> };
>>
>> static const char * const color_range_names[] = {
>> diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
>> index a7f50e1690..a3213c792a 100644
>> --- a/libavutil/pixfmt.h
>> +++ b/libavutil/pixfmt.h
>> @@ -439,6 +439,14 @@ enum AVPixelFormat {
>>      */
>>     AV_PIX_FMT_D3D12,
>>
>> +    /**
>> +     * Hardware surfaces for Tegra devices.
>> +     *
>> +     * data[0..2] points to memory-mapped buffer containing frame data
>> +     * buf[0] contains an AVBufferRef to an AVNVTegraMap
>> +     */
>> +    AV_PIX_FMT_NVTEGRA,
>> +
>>     AV_PIX_FMT_NB         ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
>> };
>>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Rémi Denis-Courmont June 1, 2024, 7:29 a.m. UTC | #3
Le lauantaina 1. kesäkuuta 2024, 0.06.55 EEST averne a écrit :
> As for your second question, I probably should've given some
> context about this decision. Initially I thought about writing a
> vaapi driver, but for a number of reasons I decided against it.

VA-API would be difficult anyway as it is tied to Linux DRM. My question remains 
though: why isn't this implemented with the NVDEC API? FFmpeg already supports 
NVDEC natively with the same set of codecs as this patchset, and presumably 
with all the necessary codec state tracking.

This would also address Timo's concerns by moving the driver into a library, 
and shield FFmpeg from hypothetical L4T ABI breaks.
Mark Thompson June 5, 2024, 8:29 p.m. UTC | #4
On 30/05/2024 20:43, averne wrote:
> This includes a new pixel format for nvtegra hardware frames, and several objects for interaction with hardware blocks.
> In particular, this contains code for channels (handles to hardware engines), maps (memory-mapped buffers shared with engines), and command buffers (abstraction for building command lists sent to the engines).
> 
> Signed-off-by: averne <averne381@gmail.com>
> ---
>  configure                  |    2 +
>  libavutil/Makefile         |    4 +
>  libavutil/nvtegra.c        | 1035 ++++++++++++++++++++++++++++++++++++
>  libavutil/nvtegra.h        |  258 +++++++++
>  libavutil/nvtegra_host1x.h |   94 ++++
>  libavutil/pixdesc.c        |    4 +
>  libavutil/pixfmt.h         |    8 +
>  7 files changed, 1405 insertions(+)
>  create mode 100644 libavutil/nvtegra.c
>  create mode 100644 libavutil/nvtegra.h
>  create mode 100644 libavutil/nvtegra_host1x.h

I don't think it is reasonable for all of this to be public API surface of ffmpeg.

A separate library containing the headers and exposing some set of functions like this might make more sense.

If this has to be in ffmpeg then it really needs to all go in one library (libavcodec I guess) so that it's not exposing all this internal detail in the public API.

Thanks,

- Mark
diff mbox series

Patch

diff --git a/configure b/configure
index 09fb2aed1b..51f169bfbd 100755
--- a/configure
+++ b/configure
@@ -361,6 +361,7 @@  External library support:
   --disable-vdpau          disable Nvidia Video Decode and Presentation API for Unix code [autodetect]
   --disable-videotoolbox   disable VideoToolbox code [autodetect]
   --disable-vulkan         disable Vulkan code [autodetect]
+  --enable-nvtegra         enable nvtegra code [no]
 
 Toolchain options:
   --arch=ARCH              select architecture [$arch]
@@ -3151,6 +3152,7 @@  videotoolbox_hwaccel_deps="videotoolbox pthreads"
 videotoolbox_hwaccel_extralibs="-framework QuartzCore"
 vulkan_deps="threads"
 vulkan_deps_any="libdl LoadLibrary"
+nvtegra_deps="gpl"
 
 av1_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_AV1"
 av1_d3d11va_hwaccel_select="av1_decoder"
diff --git a/libavutil/Makefile b/libavutil/Makefile
index 9c112bc58a..733a23a8a3 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
@@ -52,6 +52,7 @@  HEADERS = adler32.h                                                     \
           hwcontext_videotoolbox.h                                      \
           hwcontext_vdpau.h                                             \
           hwcontext_vulkan.h                                            \
+          nvtegra.h                                                     \
           nvhost_ioctl.h                                                \
           nvmap_ioctl.h                                                 \
           iamf.h                                                        \
@@ -209,6 +210,7 @@  OBJS-$(CONFIG_VDPAU)                    += hwcontext_vdpau.o
 OBJS-$(CONFIG_VULKAN)                   += hwcontext_vulkan.o vulkan.o
 
 OBJS-$(!CONFIG_VULKAN)                  += hwcontext_stub.o
+OBJS-$(CONFIG_NVTEGRA)                  += nvtegra.o
 
 OBJS += $(COMPAT_OBJS:%=../compat/%)
 
@@ -230,6 +232,8 @@  SKIPHEADERS-$(CONFIG_VDPAU)            += hwcontext_vdpau.h
 SKIPHEADERS-$(CONFIG_VULKAN)           += hwcontext_vulkan.h vulkan.h   \
                                           vulkan_functions.h            \
                                           vulkan_loader.h
+SKIPHEADERS-$(CONFIG_NVTEGRA)          += nvtegra.h                     \
+                                          nvtegra_host1x.h
 
 TESTPROGS = adler32                                                     \
             aes                                                         \
diff --git a/libavutil/nvtegra.c b/libavutil/nvtegra.c
new file mode 100644
index 0000000000..ad0bbbdfaa
--- /dev/null
+++ b/libavutil/nvtegra.c
@@ -0,0 +1,1035 @@ 
+/*
+ * Copyright (c) 2024 averne <averne381@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef __SWITCH__
+#   include <sys/ioctl.h>
+#   include <sys/mman.h>
+#   include <fcntl.h>
+#   include <unistd.h>
+#else
+#   include <stdlib.h>
+#   include <switch.h>
+#endif
+
+#include <string.h>
+
+#include "buffer.h"
+#include "log.h"
+#include "error.h"
+#include "mem.h"
+#include "thread.h"
+
+#include "nvhost_ioctl.h"
+#include "nvmap_ioctl.h"
+#include "nvtegra_host1x.h"
+
+#include "nvtegra.h"
+
+/*
+ * Tag used by the kernel to identify allocations.
+ * Official software has been seen using 0x900, 0xf00, 0x1100, 0x1400, 0x4000.
+ */
+#define MEM_TAG (0xfeed)
+
+/* Descriptors for the nvmap and nvhost-ctrl devices, shared through the globals below */
+struct DriverState {
+    int nvmap_fd, nvhost_fd;
+};
+
+/* Taken by av_nvtegra_driver_init() around creation and lookup of the shared state */
+static AVMutex g_driver_init_mtx = AV_MUTEX_INITIALIZER;
+/* Shared driver state, and the buffer reference whose free callback releases it; NULL while unset */
+static struct DriverState *g_driver_state = NULL;
+static AVBufferRef *g_driver_state_ref = NULL;
+
+/*
+ * AVBuffer free callback: release the global driver state.
+ *
+ * Closes the device descriptors (or shuts down the libnx services on Switch),
+ * then clears the global reference and frees the state. Both parameters are
+ * ignored; the function works on the globals and does nothing if no state exists.
+ */
+static void free_driver_fds(void *opaque, uint8_t *data) {
+    if (!g_driver_state)
+        return;
+
+#ifndef __SWITCH__
+    if (g_driver_state->nvmap_fd > 0)
+        close(g_driver_state->nvmap_fd);
+
+    if (g_driver_state->nvhost_fd > 0)
+        close(g_driver_state->nvhost_fd);
+#else
+    nvFenceExit();
+    nvMapExit();
+    nvExit();
+    mmuExit();
+#endif
+
+    /*
+     * The init mutex is deliberately left untouched: it is statically
+     * initialized and stays valid, and av_nvtegra_driver_init() reaches this
+     * function while holding it, so overwriting it would corrupt a locked mutex.
+     */
+    g_driver_state_ref = NULL;
+    av_freep(&g_driver_state);
+}
+
+/*
+ * Allocate the global driver state and open the driver devices.
+ *
+ * On success the state and its owning buffer reference are published in
+ * g_driver_state / g_driver_state_ref. If a device fails to open after the
+ * globals were set, they stay set and the caller is expected to release them.
+ *
+ * Returns 0 on success, a negative AVERROR code otherwise.
+ */
+static int init_driver_fds(void) {
+    AVBufferRef *ref;
+    struct DriverState *state;
+    int err;
+
+    state = av_mallocz(sizeof(*state));
+    if (!state)
+        return AVERROR(ENOMEM);
+
+    ref = av_buffer_create((uint8_t *)state, sizeof(*state), free_driver_fds, NULL, 0);
+    if (!ref) {
+        /* No buffer owns the state yet, so it has to be released here */
+        av_free(state);
+        return AVERROR(ENOMEM);
+    }
+
+    g_driver_state     = state;
+    g_driver_state_ref = ref;
+
+#ifndef __SWITCH__
+    err = open("/dev/nvmap", O_RDWR | O_SYNC);
+    if (err < 0)
+        return AVERROR(errno);
+    state->nvmap_fd = err;
+
+    err = open("/dev/nvhost-ctrl", O_RDWR | O_SYNC);
+    if (err < 0)
+        return AVERROR(errno);
+    state->nvhost_fd = err;
+#else
+    err = nvInitialize();
+    if (R_FAILED(err))
+        return AVERROR(err);
+
+    err = nvMapInit();
+    if (R_FAILED(err))
+        return AVERROR(err);
+    state->nvmap_fd = nvMapGetFd();
+
+    err = nvFenceInit();
+    if (R_FAILED(err))
+        return AVERROR(err);
+    /* libnx doesn't export the nvhost-ctrl file descriptor */
+
+    err = mmuInitialize();
+    if (R_FAILED(err))
+        return AVERROR(err);
+#endif
+
+    return 0;
+}
+
+/* Return the shared nvmap descriptor, or AVERROR_UNKNOWN when the state is missing or the descriptor is 0. */
+static inline int get_nvmap_fd(void) {
+    if (!g_driver_state || !g_driver_state->nvmap_fd)
+        return AVERROR_UNKNOWN;
+
+    return g_driver_state->nvmap_fd;
+}
+
+/* Return the shared nvhost-ctrl descriptor, or AVERROR_UNKNOWN when the state is missing or the descriptor is 0. */
+static inline int get_nvhost_fd(void) {
+    if (!g_driver_state || !g_driver_state->nvhost_fd)
+        return AVERROR_UNKNOWN;
+
+    return g_driver_state->nvhost_fd;
+}
+
+/*
+ * Get a reference to the shared driver state, initializing it on first use.
+ *
+ * Returns a buffer reference on success (a new reference to the existing
+ * state, or the initial reference when the state was just created), or NULL
+ * if locking or initialization failed.
+ */
+AVBufferRef *av_nvtegra_driver_init(void) {
+    AVBufferRef *out = NULL;
+    int err;
+
+    /*
+     * We have to do this overly complex dance of putting driver fds in a refcounted struct,
+     * otherwise initializing multiple hwcontexts would leak fds
+     */
+
+    err = ff_mutex_lock(&g_driver_init_mtx);
+    if (err != 0)
+        return NULL; /* the mutex is not held, so it must not be unlocked */
+
+    if (g_driver_state_ref) {
+        out = av_buffer_ref(g_driver_state_ref);
+        goto exit;
+    }
+
+    err = init_driver_fds();
+    if (err < 0) {
+        /* In case memory allocations failed, call the destructor ourselves */
+        av_buffer_unref(&g_driver_state_ref);
+        free_driver_fds(NULL, NULL);
+        goto exit;
+    }
+
+    out = g_driver_state_ref;
+
+exit:
+    ff_mutex_unlock(&g_driver_init_mtx);
+    return out;
+}
+
+/*
+ * Open the channel device at path dev and query its syncpoint.
+ *
+ * On success channel holds the opened descriptor (or libnx channel) and the
+ * syncpoint id. Returns 0 on success, a negative AVERROR code otherwise; on
+ * failure the device is closed again.
+ */
+int av_nvtegra_channel_open(AVNVTegraChannel *channel, const char *dev) {
+    int err;
+#ifndef __SWITCH__
+    struct nvhost_get_param_arg args;
+
+    err = open(dev, O_RDWR);
+    if (err < 0)
+        return AVERROR(errno);
+
+    channel->fd = err;
+
+    args = (struct nvhost_get_param_arg){0};
+
+    err = ioctl(channel->fd, NVHOST_IOCTL_CHANNEL_GET_SYNCPOINT, &args);
+    if (err < 0) {
+        /* Capture the ioctl error now: close() below may overwrite errno */
+        err = AVERROR(errno);
+        goto fail;
+    }
+
+    channel->syncpt = args.value;
+
+    return 0;
+
+fail:
+    close(channel->fd);
+    /* Leave no stale descriptor behind for a later av_nvtegra_channel_close() */
+    channel->fd = 0;
+    return err;
+#else
+    err = nvChannelCreate(&channel->channel, dev);
+    if (R_FAILED(err))
+        return AVERROR(err);
+
+    err = nvioctlChannel_GetSyncpt(channel->channel.fd, 0, &channel->syncpt);
+    if (R_FAILED(err))
+        goto fail;
+
+    return 0;
+
+fail:
+    nvChannelClose(&channel->channel);
+    return AVERROR(err);
+#endif
+}
+
+/*
+ * Close a channel. A channel whose descriptor is 0 is treated as already closed.
+ *
+ * Returns 0 on success, a negative AVERROR code if close() failed.
+ */
+int av_nvtegra_channel_close(AVNVTegraChannel *channel) {
+#ifndef __SWITCH__
+    int err;
+
+    if (!channel->fd)
+        return 0;
+
+    err = close(channel->fd);
+    /* Forget the descriptor either way so that it is never closed twice */
+    channel->fd = 0;
+
+    return (err < 0) ? AVERROR(errno) : 0;
+#else
+    nvChannelClose(&channel->channel);
+    return 0;
+#endif
+}
+
+/*
+ * Query the clock rate of module moduleid through the channel.
+ * The result is stored in *clock_rate when that pointer is non-NULL.
+ * Returns 0 on success, a negative AVERROR code otherwise.
+ */
+int av_nvtegra_channel_get_clock_rate(AVNVTegraChannel *channel, uint32_t moduleid, uint32_t *clock_rate) {
+#ifndef __SWITCH__
+    struct nvhost_clk_rate_args query = { .moduleid = moduleid };
+
+    if (ioctl(channel->fd, NVHOST_IOCTL_CHANNEL_GET_CLK_RATE, &query) < 0)
+        return AVERROR(errno);
+
+    if (clock_rate)
+        *clock_rate = query.rate;
+#else
+    uint32_t raw_rate;
+    int ret;
+
+    ret = AVERROR(nvioctlChannel_GetModuleClockRate(channel->channel.fd, moduleid, &raw_rate));
+    if (ret < 0)
+        return ret;
+
+    /* The value reported here is scaled by 1000 before being handed back */
+    if (clock_rate)
+        *clock_rate = raw_rate * 1000;
+#endif
+
+    return 0;
+}
+
+/*
+ * Request clock rate clock_rate for module moduleid through the channel.
+ * Returns 0 on success, a negative AVERROR code otherwise.
+ */
+int av_nvtegra_channel_set_clock_rate(AVNVTegraChannel *channel, uint32_t moduleid, uint32_t clock_rate) {
+#ifdef __SWITCH__
+    /* The rate is divided by 1000 before being passed on */
+    return AVERROR(nvioctlChannel_SetModuleClockRate(channel->channel.fd, moduleid, clock_rate / 1000));
+#else
+    struct nvhost_clk_rate_args request = {
+        .rate     = clock_rate,
+        .moduleid = moduleid,
+    };
+
+    if (ioctl(channel->fd, NVHOST_IOCTL_CHANNEL_SET_CLK_RATE, &request) < 0)
+        return AVERROR(errno);
+
+    return 0;
+#endif
+}
+
+/*
+ * Submit the contents of cmdbuf to the channel.
+ *
+ * channel: channel to submit to
+ * cmdbuf:  command buffer whose arrays and counters are handed to the driver
+ * fence:   optional; when non-NULL, receives the fence value reported back
+ *
+ * Returns 0 on success, a negative AVERROR code otherwise.
+ */
+int av_nvtegra_channel_submit(AVNVTegraChannel *channel, AVNVTegraCmdbuf *cmdbuf, uint32_t *fence) {
+    int err;
+#ifndef __SWITCH__
+    struct nvhost_submit_args args;
+
+    /* The ioctl takes user-space array addresses as integers, hence the uintptr_t casts */
+    args = (struct nvhost_submit_args){
+        .submit_version          = NVHOST_SUBMIT_VERSION_V2,
+        .num_syncpt_incrs        = cmdbuf->num_syncpt_incrs,
+        .num_cmdbufs             = cmdbuf->num_cmdbufs,
+        .num_relocs              = cmdbuf->num_relocs,
+        .num_waitchks            = cmdbuf->num_waitchks,
+        .timeout                 = 0,
+        .flags                   = 0,
+        .fence                   = 0,
+        .syncpt_incrs            = (uintptr_t)cmdbuf->syncpt_incrs,
+        .cmdbuf_exts             = (uintptr_t)cmdbuf->cmdbuf_exts,
+        .checksum_methods        = 0,
+        .checksum_falcon_methods = 0,
+        .pad                     = { 0 },
+        .reloc_types             = (uintptr_t)cmdbuf->reloc_types,
+        .cmdbufs                 = (uintptr_t)cmdbuf->cmdbufs,
+        .relocs                  = (uintptr_t)cmdbuf->relocs,
+        .reloc_shifts            = (uintptr_t)cmdbuf->reloc_shifts,
+        .waitchks                = (uintptr_t)cmdbuf->waitchks,
+        .waitbases               = 0,
+        .class_ids               = (uintptr_t)cmdbuf->class_ids,
+        .fences                  = (uintptr_t)cmdbuf->fences,
+    };
+
+    err = ioctl(channel->fd, NVHOST_IOCTL_CHANNEL_SUBMIT, &args);
+    if (err < 0)
+        return AVERROR(errno);
+
+    /* The driver writes the resulting fence back into the argument struct */
+    if (fence)
+        *fence = args.fence;
+
+    return 0;
+#else
+    nvioctl_fence tmp;
+
+    /* No relocations are passed on this path (NULL, NULL, 0); a single fence is requested */
+    err = nvioctlChannel_Submit(channel->channel.fd, (nvioctl_cmdbuf *)cmdbuf->cmdbufs, cmdbuf->num_cmdbufs,
+                                NULL, NULL, 0, (nvioctl_syncpt_incr *)cmdbuf->syncpt_incrs, cmdbuf->num_syncpt_incrs,
+                                &tmp, 1);
+    if (R_FAILED(err))
+        return AVERROR(err);
+
+    if (fence)
+        *fence = tmp.value;
+
+    return 0;
+#endif
+}
+
+/*
+ * Set the submit timeout of the channel to timeout_ms.
+ * Returns 0 on success, a negative AVERROR code otherwise.
+ */
+int av_nvtegra_channel_set_submit_timeout(AVNVTegraChannel *channel, uint32_t timeout_ms) {
+#ifdef __SWITCH__
+    return AVERROR(nvioctlChannel_SetSubmitTimeout(channel->channel.fd, timeout_ms));
+#else
+    struct nvhost_set_timeout_args request = {
+        .timeout = timeout_ms,
+    };
+
+    if (ioctl(channel->fd, NVHOST_IOCTL_CHANNEL_SET_TIMEOUT, &request) < 0)
+        return AVERROR(errno);
+
+    return 0;
+#endif
+}
+
+/*
+ * Wait on the channel's syncpoint with the given threshold and timeout.
+ * Returns 0 on success, a negative AVERROR code otherwise.
+ */
+int av_nvtegra_syncpt_wait(AVNVTegraChannel *channel, uint32_t threshold, int32_t timeout) {
+#ifndef __SWITCH__
+    struct nvhost_ctrl_syncpt_waitex_args wait_args = {
+        .id      = channel->syncpt,
+        .thresh  = threshold,
+        .timeout = timeout,
+    };
+    int ctrl_fd = get_nvhost_fd();
+
+    if (ioctl(ctrl_fd, NVHOST_IOCTL_CTRL_SYNCPT_WAITEX, &wait_args) < 0)
+        return AVERROR(errno);
+
+    return 0;
+#else
+    NvFence target = {
+        .id    = channel->syncpt,
+        .value = threshold,
+    };
+
+    return AVERROR(nvFenceWait(&target, timeout));
+#endif
+}
+
+#ifdef __SWITCH__
+/* Tell whether the cache bits of flags ask for a CPU-cacheable map. */
+static inline bool convert_cache_flags(uint32_t flags) {
+    const uint32_t cache_mode = flags & NVMAP_HANDLE_CACHE_FLAG;
+
+    return cache_mode == NVMAP_HANDLE_INNER_CACHEABLE ||
+           cache_mode == NVMAP_HANDLE_CACHEABLE;
+}
+#endif
+
+/*
+ * Allocate backing memory for a map.
+ *
+ * map:       map to fill in
+ * channel:   channel recorded as the owner of the map (only used on Switch)
+ * size:      requested size in bytes (rounded up to 0x1000 on Switch)
+ * align:     requested alignment
+ * heap_mask: heap selection passed to the driver (ignored on Switch)
+ * flags:     allocation flags; on Switch only the cache bits are looked at
+ *
+ * Returns 0 on success, a negative AVERROR code otherwise. On failure no
+ * handle or memory is left allocated.
+ */
+int av_nvtegra_map_allocate(AVNVTegraMap *map, AVNVTegraChannel *channel, uint32_t size,
+                            uint32_t align, int heap_mask, int flags)
+{
+#ifndef __SWITCH__
+    struct nvmap_create_handle create_args;
+    struct nvmap_alloc_handle alloc_args;
+    int err;
+
+    create_args = (struct nvmap_create_handle){
+        .size   = size,
+    };
+
+    err = ioctl(get_nvmap_fd(), NVMAP_IOC_CREATE, &create_args);
+    if (err < 0)
+        return AVERROR(errno);
+
+    map->size   = size;
+    map->handle = create_args.handle;
+
+    alloc_args = (struct nvmap_alloc_handle){
+        .handle    = create_args.handle,
+        .heap_mask = heap_mask,
+        .flags     = flags | (MEM_TAG << 16),
+        .align     = align,
+    };
+
+    err = ioctl(get_nvmap_fd(), NVMAP_IOC_ALLOC, &alloc_args);
+    if (err < 0) {
+        /* Capture errno first: av_nvtegra_map_free() issues another ioctl that may overwrite it */
+        err = AVERROR(errno);
+        av_nvtegra_map_free(map);
+        return err;
+    }
+
+    return 0;
+#else
+    void *mem;
+    int err;
+
+    map->owner = channel->channel.fd;
+
+    size = FFALIGN(size, 0x1000);
+
+    mem = aligned_alloc(FFALIGN(align, 0x1000), size);
+    if (!mem)
+        return AVERROR(ENOMEM);
+
+    err = nvMapCreate(&map->map, mem, size, 0x10000, NvKind_Pitch,
+                      convert_cache_flags(flags));
+    if (R_FAILED(err)) {
+        /* The map never took hold of the buffer, so release it here */
+        free(mem);
+        return AVERROR(err);
+    }
+
+    return 0;
+#endif
+}
+
+/*
+ * Release a map. A map with no handle (or, on Switch, no CPU address) is left alone.
+ * Returns 0 on success, a negative AVERROR code otherwise.
+ */
+int av_nvtegra_map_free(AVNVTegraMap *map) {
+#ifdef __SWITCH__
+    /* Remember the buffer first, then close the map and free the buffer */
+    void *backing = map->map.cpu_addr;
+
+    if (!backing)
+        return 0;
+
+    nvMapClose(&map->map);
+    free(backing);
+    return 0;
+#else
+    if (!map->handle)
+        return 0;
+
+    if (ioctl(get_nvmap_fd(), NVMAP_IOC_FREE, map->handle) < 0)
+        return AVERROR(errno);
+
+    map->handle = 0;
+    return 0;
+#endif
+}
+
+/*
+ * Create a map on top of caller-provided memory.
+ *
+ * mem and size describe the memory; owner is recorded as the map owner on
+ * Switch; align is not used by this function.
+ * Returns 0 on success, a negative AVERROR code otherwise.
+ */
+int av_nvtegra_map_from_va(AVNVTegraMap *map, AVNVTegraChannel *owner, void *mem,
+                           uint32_t size, uint32_t align, uint32_t flags)
+{
+#ifdef __SWITCH__
+    map->owner = owner->channel.fd;
+
+    return AVERROR(nvMapCreate(&map->map, mem, FFALIGN(size, 0x1000), 0x10000, NvKind_Pitch,
+                               convert_cache_flags(flags)));
+#else
+    struct nvmap_create_handle_from_va request = {
+        .va    = (uintptr_t)mem,
+        .size  = size,
+        .flags = flags | (MEM_TAG << 16),
+    };
+
+    if (ioctl(get_nvmap_fd(), NVMAP_IOC_FROM_VA, &request) < 0)
+        return AVERROR(errno);
+
+    map->handle   = request.handle;
+    map->size     = size;
+    map->cpu_addr = mem;
+
+    return 0;
+#endif
+}
+
+/*
+ * Close a map. On Switch only the map object is closed; elsewhere this is
+ * the same as av_nvtegra_map_free().
+ */
+int av_nvtegra_map_close(AVNVTegraMap *map) {
+#ifdef __SWITCH__
+    nvMapClose(&map->map);
+    return 0;
+#else
+    return av_nvtegra_map_free(map);
+#endif
+}
+
+/*
+ * Map a map: mmap() its handle for CPU access, or on Switch map it into the
+ * owner channel and record the resulting iova.
+ * Returns 0 on success, a negative AVERROR code otherwise.
+ */
+int av_nvtegra_map_map(AVNVTegraMap *map) {
+#ifndef __SWITCH__
+    void *mapping = mmap(NULL, map->size, PROT_READ | PROT_WRITE, MAP_SHARED, map->handle, 0);
+
+    if (mapping == MAP_FAILED)
+        return AVERROR(errno);
+
+    map->cpu_addr = mapping;
+    return 0;
+#else
+    nvioctl_command_buffer_map request = {
+        .handle = map->map.handle,
+    };
+    int rc;
+
+    rc = nvioctlChannel_MapCommandBuffer(map->owner, &request, 1, false);
+    if (R_FAILED(rc))
+        return AVERROR(rc);
+
+    map->iova = request.iova;
+    return 0;
+#endif
+}
+
+/*
+ * Undo av_nvtegra_map_map(). A map that is not currently mapped is left alone.
+ * Returns 0 on success, a negative AVERROR code otherwise.
+ */
+int av_nvtegra_map_unmap(AVNVTegraMap *map) {
+#ifndef __SWITCH__
+    if (!map->cpu_addr)
+        return 0;
+
+    if (munmap(map->cpu_addr, map->size) < 0)
+        return AVERROR(errno);
+
+    map->cpu_addr = NULL;
+    return 0;
+#else
+    nvioctl_command_buffer_map request;
+    int rc;
+
+    if (!map->iova)
+        return 0;
+
+    request = (nvioctl_command_buffer_map){
+        .handle = map->map.handle,
+        .iova   = map->iova,
+    };
+
+    rc = nvioctlChannel_UnmapCommandBuffer(map->owner, &request, 1, false);
+    if (R_FAILED(rc))
+        return AVERROR(rc);
+
+    map->iova = 0;
+    return 0;
+#endif
+}
+
+/*
+ * Perform cache maintenance operation op on len bytes at addr inside map.
+ *
+ * On Switch the operation is skipped for maps that are not CPU-cacheable, and
+ * everything except NVMAP_CACHE_OP_WB is handled with armDCacheFlush().
+ *
+ * Returns 0 on success, a negative AVERROR code otherwise.
+ */
+int av_nvtegra_map_cache_op(AVNVTegraMap *map, int op, void *addr, size_t len) {
+#ifndef __SWITCH__
+    struct nvmap_cache_op args;
+
+    args = (struct nvmap_cache_op){
+        .addr   = (uintptr_t)addr,
+        .len    = len,
+        .handle = av_nvtegra_map_get_handle(map),
+        .op     = op,
+    };
+
+    /* ioctl() signals failure with -1 and errno; wrapping -1 in AVERROR() would yield a positive value */
+    return (ioctl(get_nvmap_fd(), NVMAP_IOC_CACHE, &args) < 0) ? AVERROR(errno) : 0;
+#else
+    if (!map->map.is_cpu_cacheable)
+        return 0;
+
+    switch (op) {
+        case NVMAP_CACHE_OP_WB:
+            armDCacheClean(addr, len);
+            break;
+        default:
+        case NVMAP_CACHE_OP_INV:
+        case NVMAP_CACHE_OP_WB_INV:
+            /* libnx internally performs a clean-invalidate, since invalidate is a privileged instruction */
+            armDCacheFlush(addr, len);
+            break;
+    }
+
+    return 0;
+#endif
+}
+
+/*
+ * Grow a map to at least size bytes.
+ *
+ * Nothing happens when the map is already large enough. Otherwise a new map is
+ * created, the old contents are copied over, the old map is destroyed and *map
+ * is overwritten with the new one.
+ *
+ * Returns 0 on success, a negative AVERROR code otherwise.
+ */
+int av_nvtegra_map_realloc(AVNVTegraMap *map, uint32_t size, uint32_t align,
+                           int heap_mask, int flags)
+{
+    AVNVTegraChannel channel;
+    AVNVTegraMap tmp = {0};
+    int err;
+
+    if (av_nvtegra_map_get_size(map) >= size)
+        return 0;
+
+    /* Dummy channel object to hold the owner fd */
+    channel = (AVNVTegraChannel){
+#ifdef __SWITCH__
+        .channel.fd = map->owner,
+#endif
+    };
+
+    err = av_nvtegra_map_create(&tmp, &channel, size, align, heap_mask, flags);
+    if (err < 0)
+        goto fail;
+
+    /* Copies the whole old map; both CPU addresses are dereferenced here */
+    memcpy(av_nvtegra_map_get_addr(&tmp), av_nvtegra_map_get_addr(map), av_nvtegra_map_get_size(map));
+
+    err = av_nvtegra_map_destroy(map);
+    if (err < 0)
+        goto fail;
+
+    *map = tmp;
+
+    return 0;
+
+fail:
+    /* Also reached when creation failed, with tmp still in whatever state creation left it */
+    av_nvtegra_map_destroy(&tmp);
+    return err;
+}
+
+/*
+ * Initialize a command buffer: reset its counters and allocate the initial
+ * bookkeeping arrays.
+ *
+ * Returns 0 on success, AVERROR(ENOMEM) if an allocation failed. Arrays that
+ * were already allocated are not released on failure.
+ * NOTE(review): on an early failure the array pointers of the later groups are
+ * never written, so calling av_nvtegra_cmdbuf_deinit() afterwards is presumably
+ * only safe if the caller zeroed the struct beforehand -- confirm.
+ */
+int av_nvtegra_cmdbuf_init(AVNVTegraCmdbuf *cmdbuf) {
+    cmdbuf->num_cmdbufs      = 0;
+#ifndef __SWITCH__
+    cmdbuf->num_relocs       = 0;
+    cmdbuf->num_waitchks     = 0;
+#endif
+    cmdbuf->num_syncpt_incrs = 0;
+
+/* Initial capacities of the three array groups allocated below */
+#define NUM_INITIAL_CMDBUFS      3
+#define NUM_INITIAL_RELOCS       15
+#define NUM_INITIAL_SYNCPT_INCRS 3
+
+    /* Group 1: one entry per command buffer */
+    cmdbuf->cmdbufs      = av_malloc_array(NUM_INITIAL_CMDBUFS, sizeof(*cmdbuf->cmdbufs));
+#ifndef __SWITCH__
+    cmdbuf->cmdbuf_exts  = av_malloc_array(NUM_INITIAL_CMDBUFS, sizeof(*cmdbuf->cmdbuf_exts));
+    cmdbuf->class_ids    = av_malloc_array(NUM_INITIAL_CMDBUFS, sizeof(*cmdbuf->class_ids));
+#endif
+
+#ifndef __SWITCH__
+    if (!cmdbuf->cmdbufs || !cmdbuf->cmdbuf_exts || !cmdbuf->class_ids)
+#else
+    if (!cmdbuf->cmdbufs)
+#endif
+        return AVERROR(ENOMEM);
+
+#ifndef __SWITCH__
+    /* Group 2: one entry per relocation (not present on Switch) */
+    cmdbuf->relocs       = av_malloc_array(NUM_INITIAL_RELOCS, sizeof(*cmdbuf->relocs));
+    cmdbuf->reloc_types  = av_malloc_array(NUM_INITIAL_RELOCS, sizeof(*cmdbuf->reloc_types));
+    cmdbuf->reloc_shifts = av_malloc_array(NUM_INITIAL_RELOCS, sizeof(*cmdbuf->reloc_shifts));
+    if (!cmdbuf->relocs || !cmdbuf->reloc_types || !cmdbuf->reloc_shifts)
+        return AVERROR(ENOMEM);
+#endif
+
+    /* Group 3: one entry per syncpoint increment */
+    cmdbuf->syncpt_incrs = av_malloc_array(NUM_INITIAL_SYNCPT_INCRS, sizeof(*cmdbuf->syncpt_incrs));
+#ifndef __SWITCH__
+    cmdbuf->fences       = av_malloc_array(NUM_INITIAL_SYNCPT_INCRS, sizeof(*cmdbuf->fences));
+#endif
+
+#ifndef __SWITCH__
+    if (!cmdbuf->syncpt_incrs || !cmdbuf->fences)
+#else
+    if (!cmdbuf->syncpt_incrs)
+#endif
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+/* Free every bookkeeping array of the command buffer and reset the pointers to NULL; always returns 0. */
+int av_nvtegra_cmdbuf_deinit(AVNVTegraCmdbuf *cmdbuf) {
+#ifndef __SWITCH__
+    /* Arrays that only exist outside of Switch builds */
+    av_freep(&cmdbuf->cmdbuf_exts);
+    av_freep(&cmdbuf->class_ids);
+    av_freep(&cmdbuf->relocs);
+    av_freep(&cmdbuf->reloc_types);
+    av_freep(&cmdbuf->reloc_shifts);
+    av_freep(&cmdbuf->fences);
+#endif
+
+    av_freep(&cmdbuf->cmdbufs);
+    av_freep(&cmdbuf->syncpt_incrs);
+
+    return 0;
+}
+
+/*
+ * Attach a memory window of map (size bytes starting at offset) to the command
+ * buffer and place the write cursor at its start; always returns 0.
+ */
+int av_nvtegra_cmdbuf_add_memory(AVNVTegraCmdbuf *cmdbuf, AVNVTegraMap *map, uint32_t offset, uint32_t size) {
+    uint8_t *base = av_nvtegra_map_get_addr(map);
+
+    cmdbuf->map        = map;
+    cmdbuf->mem_size   = size;
+    cmdbuf->mem_offset = offset;
+    cmdbuf->cur_word   = (uint32_t *)(base + cmdbuf->mem_offset);
+
+    return 0;
+}
+
+/* Reset all counters and rewind the write cursor to the start of the memory window; always returns 0. */
+int av_nvtegra_cmdbuf_clear(AVNVTegraCmdbuf *cmdbuf) {
+    uint8_t *base = av_nvtegra_map_get_addr(cmdbuf->map);
+
+    cmdbuf->num_cmdbufs      = 0;
+    cmdbuf->num_syncpt_incrs = 0;
+#ifndef __SWITCH__
+    cmdbuf->num_relocs       = 0;
+    cmdbuf->num_waitchks     = 0;
+#endif
+
+    cmdbuf->cur_word = (uint32_t *)(base + cmdbuf->mem_offset);
+
+    return 0;
+}
+
+/*
+ * Start a new command buffer entry at the current write position.
+ *
+ * Makes room for entry number num_cmdbufs in the bookkeeping arrays and
+ * initializes it; the entry is only counted once av_nvtegra_cmdbuf_end() runs.
+ * On Switch, the first entry additionally gets a setclass opcode for class_id
+ * pushed into it; elsewhere class_id is recorded in the class_ids array.
+ *
+ * Returns 0 on success, a negative AVERROR code otherwise.
+ */
+int av_nvtegra_cmdbuf_begin(AVNVTegraCmdbuf *cmdbuf, uint32_t class_id) {
+    uint8_t *mem;
+    void *tmp;
+
+    mem = av_nvtegra_map_get_addr(cmdbuf->map);
+
+    /*
+     * Each array pointer is stored as soon as its reallocation succeeds: the
+     * block may have moved, so returning on a later failure without updating
+     * it would leave a dangling pointer in the struct.
+     */
+    tmp = av_realloc_array(cmdbuf->cmdbufs,     cmdbuf->num_cmdbufs + 1, sizeof(*cmdbuf->cmdbufs));
+    if (!tmp)
+        return AVERROR(ENOMEM);
+    cmdbuf->cmdbufs = tmp;
+
+#ifndef __SWITCH__
+    tmp = av_realloc_array(cmdbuf->cmdbuf_exts, cmdbuf->num_cmdbufs + 1, sizeof(*cmdbuf->cmdbuf_exts));
+    if (!tmp)
+        return AVERROR(ENOMEM);
+    cmdbuf->cmdbuf_exts = tmp;
+
+    tmp = av_realloc_array(cmdbuf->class_ids,   cmdbuf->num_cmdbufs + 1, sizeof(*cmdbuf->class_ids));
+    if (!tmp)
+        return AVERROR(ENOMEM);
+    cmdbuf->class_ids = tmp;
+#endif
+
+    /* The compound literal zeroes the remaining members, including the word count */
+    cmdbuf->cmdbufs[cmdbuf->num_cmdbufs] = (struct nvhost_cmdbuf){
+        .mem       = av_nvtegra_map_get_handle(cmdbuf->map),
+        .offset    = (uint8_t *)cmdbuf->cur_word - mem,
+    };
+
+#ifndef __SWITCH__
+    cmdbuf->cmdbuf_exts[cmdbuf->num_cmdbufs] = (struct nvhost_cmdbuf_ext){
+        .pre_fence = -1,
+    };
+
+    cmdbuf->class_ids[cmdbuf->num_cmdbufs] = class_id;
+
+    return 0;
+#else
+    /* Report a full command memory instead of silently dropping the opcode */
+    if (cmdbuf->num_cmdbufs == 0)
+        return av_nvtegra_cmdbuf_push_word(cmdbuf, host1x_opcode_setclass(class_id, 0, 0));
+
+    return 0;
+#endif
+}
+
+/* Finish the entry set up by av_nvtegra_cmdbuf_begin() by counting it; always returns 0. */
+int av_nvtegra_cmdbuf_end(AVNVTegraCmdbuf *cmdbuf) {
+    cmdbuf->num_cmdbufs++;
+    return 0;
+}
+
+/*
+ * Append one 32-bit word at the write cursor and count it in the current entry.
+ * Returns 0 on success, AVERROR(ENOMEM) when the memory window is used up.
+ */
+int av_nvtegra_cmdbuf_push_word(AVNVTegraCmdbuf *cmdbuf, uint32_t word) {
+    const uintptr_t window_start = (uintptr_t)av_nvtegra_map_get_addr(cmdbuf->map) + cmdbuf->mem_offset;
+    const uintptr_t bytes_used   = (uintptr_t)cmdbuf->cur_word - window_start;
+
+    if (bytes_used >= cmdbuf->mem_size)
+        return AVERROR(ENOMEM);
+
+    *cmdbuf->cur_word = word;
+    cmdbuf->cur_word++;
+    cmdbuf->cmdbufs[cmdbuf->num_cmdbufs].words++;
+
+    return 0;
+}
+
+/*
+ * Emit a three-word sequence: an incrementing-write opcode covering two
+ * registers starting at NV_THI_METHOD0, followed by offset and word as its
+ * two data words.
+ *
+ * Returns 0 on success or the first error reported while pushing a word.
+ */
+int av_nvtegra_cmdbuf_push_value(AVNVTegraCmdbuf *cmdbuf, uint32_t offset, uint32_t word) {
+    const uint32_t sequence[] = {
+        host1x_opcode_incr(NV_THI_METHOD0>>2, 2),
+        offset,
+        word,
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof(sequence) / sizeof(sequence[0]); i++) {
+        int ret = av_nvtegra_cmdbuf_push_word(cmdbuf, sequence[i]);
+        if (ret < 0)
+            return ret;
+    }
+
+    return 0;
+}
+
+/*
+ * Emit a method write whose data word is an address inside another map.
+ *
+ * Non-Switch builds push a placeholder data word (0xdeadbeef) and record a
+ * relocation entry that points at this placeholder (its byte offset within
+ * the command map) together with the target handle, target offset, type and
+ * shift. Switch builds write (target->iova + target_offset) >> shift
+ * directly; reloc_type is unused there.
+ *
+ * Returns 0 on success, AVERROR(ENOMEM) if a relocation array cannot be
+ * grown, or the error reported while pushing the words.
+ */
+int av_nvtegra_cmdbuf_push_reloc(AVNVTegraCmdbuf *cmdbuf, uint32_t offset, AVNVTegraMap *target, uint32_t target_offset,
+                                 int reloc_type, int shift)
+{
+    int err;
+#ifndef __SWITCH__
+    uint8_t *mem;
+    void *tmp;
+
+    mem = av_nvtegra_map_get_addr(cmdbuf->map);
+
+    /*
+     * Commit each array right after it has been grown: the old pointer may
+     * be stale once a reallocation succeeded, so it must not be kept when a
+     * later reallocation fails.
+     */
+    tmp = av_realloc_array(cmdbuf->relocs,       cmdbuf->num_relocs + 1, sizeof(*cmdbuf->relocs));
+    if (!tmp)
+        return AVERROR(ENOMEM);
+    cmdbuf->relocs = tmp;
+
+    tmp = av_realloc_array(cmdbuf->reloc_types,  cmdbuf->num_relocs + 1, sizeof(*cmdbuf->reloc_types));
+    if (!tmp)
+        return AVERROR(ENOMEM);
+    cmdbuf->reloc_types = tmp;
+
+    tmp = av_realloc_array(cmdbuf->reloc_shifts, cmdbuf->num_relocs + 1, sizeof(*cmdbuf->reloc_shifts));
+    if (!tmp)
+        return AVERROR(ENOMEM);
+    cmdbuf->reloc_shifts = tmp;
+
+    err = av_nvtegra_cmdbuf_push_value(cmdbuf, offset, 0xdeadbeef);
+    if (err < 0)
+        return err;
+
+    cmdbuf->relocs[cmdbuf->num_relocs]       = (struct nvhost_reloc){
+        .cmdbuf_mem    = av_nvtegra_map_get_handle(cmdbuf->map),
+        /* Byte offset of the placeholder, i.e. the word that was just pushed */
+        .cmdbuf_offset = (uint8_t *)cmdbuf->cur_word - mem - sizeof(uint32_t),
+        .target        = av_nvtegra_map_get_handle(target),
+        .target_offset = target_offset,
+    };
+
+    cmdbuf->reloc_types[cmdbuf->num_relocs]  = (struct nvhost_reloc_type){
+        .reloc_type    = reloc_type,
+    };
+
+    cmdbuf->reloc_shifts[cmdbuf->num_relocs] = (struct nvhost_reloc_shift){
+        .shift         = shift,
+    };
+
+    cmdbuf->num_relocs++;
+
+    return 0;
+#else
+    err = av_nvtegra_cmdbuf_push_value(cmdbuf, offset, (target->iova + target_offset) >> shift);
+    if (err < 0)
+        return err;
+
+    return 0;
+#endif
+}
+
+/*
+ * Emit a two-word syncpoint increment: a non-incrementing write of one word
+ * to NV_THI_INCR_SYNCPT, whose payload carries the syncpoint index and the
+ * OP_DONE condition.
+ *
+ * Returns 0 on success or the error reported while pushing a word.
+ */
+int av_nvtegra_cmdbuf_push_syncpt_incr(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt) {
+    const uint32_t header  = host1x_opcode_nonincr(NV_THI_INCR_SYNCPT>>2, 1);
+    const uint32_t payload = AV_NVTEGRA_VALUE(NV_THI_INCR_SYNCPT, INDX, syncpt) |
+                             AV_NVTEGRA_ENUM (NV_THI_INCR_SYNCPT, COND, OP_DONE);
+    int ret;
+
+    ret = av_nvtegra_cmdbuf_push_word(cmdbuf, header);
+    if (ret < 0)
+        return ret;
+
+    ret = av_nvtegra_cmdbuf_push_word(cmdbuf, payload);
+
+    return ret < 0 ? ret : 0;
+}
+
+/*
+ * Emit a four-word syncpoint wait: switch to the host1x class, issue a
+ * masked write based at NV_CLASS_HOST_LOAD_SYNCPT_PAYLOAD, then supply the
+ * fence value followed by the syncpoint id as data words.
+ *
+ * NOTE(review): the mask bits are computed from differences of the
+ * NV_CLASS_HOST_* defines, while the base offset passed to the opcode is
+ * that define shifted right by 2. If the mask bits are meant to be word
+ * offsets relative to the base, the differences likely need the same shift
+ * -- confirm against the host1x opcode documentation.
+ *
+ * Returns 0 on success or the first error reported while pushing a word.
+ */
+int av_nvtegra_cmdbuf_push_wait(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt, uint32_t fence) {
+    const uint32_t sequence[] = {
+        host1x_opcode_setclass(HOST1X_CLASS_HOST1X, 0, 0),
+        host1x_opcode_mask(NV_CLASS_HOST_LOAD_SYNCPT_PAYLOAD>>2,
+                           (1<<(NV_CLASS_HOST_LOAD_SYNCPT_PAYLOAD - NV_CLASS_HOST_LOAD_SYNCPT_PAYLOAD)) |
+                           (1<<(NV_CLASS_HOST_WAIT_SYNCPT         - NV_CLASS_HOST_LOAD_SYNCPT_PAYLOAD))),
+        fence,
+        syncpt,
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof(sequence) / sizeof(sequence[0]); i++) {
+        int ret = av_nvtegra_cmdbuf_push_word(cmdbuf, sequence[i]);
+        if (ret < 0)
+            return ret;
+    }
+
+    return 0;
+}
+
+/*
+ * Record a syncpoint increment for submission and emit the matching
+ * increment command into the command stream.
+ *
+ * A new syncpt_incrs entry with an increment count of 1 is appended; on
+ * non-Switch builds fence is stored in the parallel fences array (it is
+ * unused on Switch).
+ *
+ * Returns 0 on success, AVERROR(ENOMEM) if an array cannot be grown, or the
+ * error reported by av_nvtegra_cmdbuf_push_syncpt_incr().
+ */
+int av_nvtegra_cmdbuf_add_syncpt_incr(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt, uint32_t fence)
+{
+    void *tmp;
+
+    /*
+     * Commit each array right after it has been grown: the old pointer may
+     * be stale once a reallocation succeeded, so it must not be kept when a
+     * later reallocation fails.
+     */
+    tmp = av_realloc_array(cmdbuf->syncpt_incrs, cmdbuf->num_syncpt_incrs + 1, sizeof(*cmdbuf->syncpt_incrs));
+    if (!tmp)
+        return AVERROR(ENOMEM);
+    cmdbuf->syncpt_incrs = tmp;
+
+#ifndef __SWITCH__
+    tmp = av_realloc_array(cmdbuf->fences,       cmdbuf->num_syncpt_incrs + 1, sizeof(*cmdbuf->fences));
+    if (!tmp)
+        return AVERROR(ENOMEM);
+    cmdbuf->fences = tmp;
+#endif
+
+    cmdbuf->syncpt_incrs[cmdbuf->num_syncpt_incrs] = (struct nvhost_syncpt_incr){
+        .syncpt_id    = syncpt,
+        .syncpt_incrs = 1,
+    };
+
+#ifndef __SWITCH__
+    cmdbuf->fences[cmdbuf->num_syncpt_incrs]       = fence;
+#endif
+
+    cmdbuf->num_syncpt_incrs++;
+
+    return av_nvtegra_cmdbuf_push_syncpt_incr(cmdbuf, syncpt);
+}
+
+/*
+ * Record a wait check (non-Switch builds only) and emit a syncpoint wait
+ * into the command stream.
+ *
+ * NOTE(review): the recorded offset is computed before the wait words are
+ * pushed, so it designates the word preceding the wait sequence -- confirm
+ * that this is the location the wait check is supposed to reference.
+ *
+ * Returns AVERROR(ENOMEM) if the waitchks array cannot be grown, otherwise
+ * the result of av_nvtegra_cmdbuf_push_wait().
+ */
+int av_nvtegra_cmdbuf_add_waitchk(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt, uint32_t fence) {
+#ifndef __SWITCH__
+    uint8_t *base = av_nvtegra_map_get_addr(cmdbuf->map);
+    struct nvhost_waitchk *entry;
+    void *grown;
+
+    grown = av_realloc_array(cmdbuf->waitchks, cmdbuf->num_waitchks + 1, sizeof(*cmdbuf->waitchks));
+    if (!grown)
+        return AVERROR(ENOMEM);
+    cmdbuf->waitchks = grown;
+
+    entry  = &cmdbuf->waitchks[cmdbuf->num_waitchks];
+    *entry = (struct nvhost_waitchk){
+        .mem       = av_nvtegra_map_get_handle(cmdbuf->map),
+        .offset    = (uint8_t *)cmdbuf->cur_word - base - sizeof(uint32_t),
+        .syncpt_id = syncpt,
+        .thresh    = fence,
+    };
+
+    cmdbuf->num_waitchks += 1;
+#endif
+
+    return av_nvtegra_cmdbuf_push_wait(cmdbuf, syncpt, fence);
+}
+
+/*
+ * AVBuffer free callback for pooled jobs: releases the command buffer
+ * state, destroys the input map, then frees the job structure itself.
+ * A NULL data pointer is ignored; opaque is unused.
+ */
+static void nvtegra_job_free(void *opaque, uint8_t *data) {
+    AVNVTegraJob *job = (AVNVTegraJob *)data;
+
+    if (job) {
+        av_nvtegra_cmdbuf_deinit(&job->cmdbuf);
+        av_nvtegra_map_destroy(&job->input_map);
+
+        av_freep(&job);
+    }
+}
+
+/*
+ * AVBufferPool allocation callback: creates one job for the pool passed as
+ * opaque.
+ *
+ * The job gets an input map of pool->input_map_size bytes and a command
+ * buffer that records into that map, in the region of pool->max_cmdbuf_size
+ * bytes starting at pool->cmdbuf_off. The size argument is not used; the
+ * buffer always wraps sizeof(AVNVTegraJob) bytes.
+ *
+ * Returns a buffer owning the job (released through nvtegra_job_free()), or
+ * NULL on failure.
+ */
+static AVBufferRef *nvtegra_job_alloc(void *opaque, size_t size) {
+    AVNVTegraJobPool *pool = opaque;
+
+    AVBufferRef  *buffer;
+    AVNVTegraJob *job;
+    int err;
+
+    job = av_mallocz(sizeof(*job));
+    if (!job)
+        return NULL;
+
+    err = av_nvtegra_map_create(&job->input_map, pool->channel, pool->input_map_size, 0x100,
+                                NVMAP_HEAP_IOVMM, NVMAP_HANDLE_WRITE_COMBINE);
+    if (err < 0)
+        goto fail;
+
+    err = av_nvtegra_cmdbuf_init(&job->cmdbuf);
+    if (err < 0)
+        goto fail;
+
+    err = av_nvtegra_cmdbuf_add_memory(&job->cmdbuf, &job->input_map, pool->cmdbuf_off, pool->max_cmdbuf_size);
+    if (err < 0)
+        goto fail;
+
+    buffer = av_buffer_create((uint8_t *)job, sizeof(*job), nvtegra_job_free, pool, 0);
+    if (!buffer)
+        goto fail;
+
+    return buffer;
+
+fail:
+    av_nvtegra_cmdbuf_deinit(&job->cmdbuf);
+    av_nvtegra_map_destroy(&job->input_map);
+    /*
+     * av_freep() takes the address of the pointer to free; passing the job
+     * itself would free whatever its first bytes contain and leak the job.
+     */
+    av_freep(&job);
+    return NULL;
+}
+
+/*
+ * Set up a job pool: store the parameters used when jobs are allocated and
+ * create the underlying buffer pool, with the pool itself as the opaque
+ * pointer of the allocation callback.
+ *
+ * Returns 0 on success or AVERROR(ENOMEM) if the buffer pool cannot be
+ * created.
+ */
+int av_nvtegra_job_pool_init(AVNVTegraJobPool *pool, AVNVTegraChannel *channel,
+                             size_t input_map_size, off_t cmdbuf_off, size_t max_cmdbuf_size)
+{
+    pool->channel         = channel;
+    pool->input_map_size  = input_map_size;
+    pool->cmdbuf_off      = cmdbuf_off;
+    pool->max_cmdbuf_size = max_cmdbuf_size;
+
+    pool->pool = av_buffer_pool_init2(sizeof(AVNVTegraJob), pool,
+                                      nvtegra_job_alloc, NULL);
+
+    return pool->pool ? 0 : AVERROR(ENOMEM);
+}
+
+/*
+ * Release the buffer pool backing the job pool. Always returns 0.
+ */
+int av_nvtegra_job_pool_uninit(AVNVTegraJobPool *pool) {
+    AVBufferPool **jobs = &pool->pool;
+
+    av_buffer_pool_uninit(jobs);
+
+    return 0;
+}
+
+/*
+ * Obtain a job buffer from the pool; returns whatever av_buffer_pool_get()
+ * returns for the underlying buffer pool.
+ */
+AVBufferRef *av_nvtegra_job_pool_get(AVNVTegraJobPool *pool) {
+    AVBufferPool *jobs = pool->pool;
+
+    return av_buffer_pool_get(jobs);
+}
+
+/*
+ * Submit the job's command buffer to the pool's channel; the fence produced
+ * by the submission is stored in job->fence.
+ */
+int av_nvtegra_job_submit(AVNVTegraJobPool *pool, AVNVTegraJob *job) {
+    AVNVTegraChannel *channel = pool->channel;
+
+    return av_nvtegra_channel_submit(channel, &job->cmdbuf, &job->fence);
+}
+
+/*
+ * Wait on the pool's channel for job->fence, forwarding timeout unchanged
+ * to av_nvtegra_syncpt_wait().
+ */
+int av_nvtegra_job_wait(AVNVTegraJobPool *pool, AVNVTegraJob *job, int timeout) {
+    AVNVTegraChannel *channel = pool->channel;
+
+    return av_nvtegra_syncpt_wait(channel, job->fence, timeout);
+}
diff --git a/libavutil/nvtegra.h b/libavutil/nvtegra.h
new file mode 100644
index 0000000000..3b63335d6c
--- /dev/null
+++ b/libavutil/nvtegra.h
@@ -0,0 +1,258 @@ 
+/*
+ * Copyright (c) 2024 averne <averne381@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef AVUTIL_NVTEGRA_H
+#define AVUTIL_NVTEGRA_H
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "buffer.h"
+
+#include "nvhost_ioctl.h"
+#include "nvmap_ioctl.h"
+
+/*
+ * Channel object; per the patch description, a channel is a handle to a
+ * hardware engine. The members that identify the channel differ between
+ * Switch (__SWITCH__) and other builds.
+ */
+typedef struct AVNVTegraChannel {
+#ifndef __SWITCH__
+    int fd;
+    int module_id;
+#else
+    NvChannel channel;
+#endif
+
+    /* NOTE(review): presumably the syncpoint tied to this channel -- confirm */
+    uint32_t syncpt;
+
+#ifdef __SWITCH__
+    MmuRequest mmu_request;
+#endif
+    /* NOTE(review): presumably the engine clock rate; units not visible here */
+    uint32_t clock;
+} AVNVTegraChannel;
+
+/*
+ * Map object; per the patch description, a memory-mapped buffer shared with
+ * hardware engines. Handle, CPU address and size live in platform-specific
+ * members and should be read through av_nvtegra_map_get_handle(),
+ * av_nvtegra_map_get_addr() and av_nvtegra_map_get_size().
+ */
+typedef struct AVNVTegraMap {
+#ifndef __SWITCH__
+    /* Handle referenced by command buffer, relocation and wait check entries */
+    uint32_t handle;
+    uint32_t size;
+    /* Address through which the CPU accesses the buffer */
+    void *cpu_addr;
+#else
+    NvMap map;
+    /* Address written into command buffers by av_nvtegra_cmdbuf_push_reloc() */
+    uint32_t iova;
+    uint32_t owner;
+#endif
+    bool is_linear;
+} AVNVTegraMap;
+
+/*
+ * State used to record command lists into a region of a map, along with
+ * the bookkeeping arrays that describe the recorded commands. Arrays that
+ * share a counter are grown together and indexed identically.
+ */
+typedef struct AVNVTegraCmdbuf {
+    /* Map the command words are written to */
+    AVNVTegraMap *map;
+
+    /* Command region within the map: start offset and size, in bytes */
+    uint32_t mem_offset, mem_size;
+
+    /* Write cursor: location where the next command word will be stored */
+    uint32_t *cur_word;
+
+    /* One entry per begin/end pair; slot num_cmdbufs is the one being recorded */
+    struct nvhost_cmdbuf       *cmdbufs;
+#ifndef __SWITCH__
+    struct nvhost_cmdbuf_ext   *cmdbuf_exts;
+    uint32_t                   *class_ids;
+#endif
+    uint32_t num_cmdbufs;
+
+#ifndef __SWITCH__
+    /* Relocations recorded by av_nvtegra_cmdbuf_push_reloc() */
+    struct nvhost_reloc        *relocs;
+    struct nvhost_reloc_type   *reloc_types;
+    struct nvhost_reloc_shift  *reloc_shifts;
+    uint32_t num_relocs;
+#endif
+
+    /* Syncpoint increments recorded by av_nvtegra_cmdbuf_add_syncpt_incr() */
+    struct nvhost_syncpt_incr  *syncpt_incrs;
+#ifndef __SWITCH__
+    uint32_t                   *fences;
+#endif
+    uint32_t num_syncpt_incrs;
+
+#ifndef __SWITCH__
+    /* Wait checks recorded by av_nvtegra_cmdbuf_add_waitchk() */
+    struct nvhost_waitchk      *waitchks;
+    uint32_t num_waitchks;
+#endif
+} AVNVTegraCmdbuf;
+
+/*
+ * Pool of reusable jobs bound to one channel. The fields other than pool
+ * are the parameters applied to every job allocated by the pool; they are
+ * filled in by av_nvtegra_job_pool_init().
+ */
+typedef struct AVNVTegraJobPool {
+    /*
+     * Pool object for job allocation
+     */
+    AVBufferPool *pool;
+
+    /*
+     * Hardware channel the jobs will be submitted to
+     */
+    AVNVTegraChannel *channel;
+
+    /*
+     * Total size of the input memory-mapped buffer
+     */
+    size_t input_map_size;
+
+    /*
+     * Offset of the command data within the input map
+     */
+    off_t cmdbuf_off;
+
+    /*
+     * Maximum memory usable by the command buffer
+     */
+    size_t max_cmdbuf_size;
+} AVNVTegraJobPool;
+
+/*
+ * Unit of work handed out by an AVNVTegraJobPool: an input map plus the
+ * command buffer that records into it.
+ */
+typedef struct AVNVTegraJob {
+    /*
+     * Memory-mapped buffer for command buffers, metadata structures, ...
+     */
+    AVNVTegraMap input_map;
+
+    /*
+     * Object for command recording
+     */
+    AVNVTegraCmdbuf cmdbuf;
+
+    /*
+     * Fence indicating completion of the job; written by
+     * av_nvtegra_job_submit() and waited on by av_nvtegra_job_wait()
+     */
+    uint32_t fence;
+} AVNVTegraJob;
+
+AVBufferRef *av_nvtegra_driver_init(void);
+
+int av_nvtegra_channel_open(AVNVTegraChannel *channel, const char *dev);
+int av_nvtegra_channel_close(AVNVTegraChannel *channel);
+int av_nvtegra_channel_get_clock_rate(AVNVTegraChannel *channel, uint32_t moduleid, uint32_t *clock_rate);
+int av_nvtegra_channel_set_clock_rate(AVNVTegraChannel *channel, uint32_t moduleid, uint32_t clock_rate);
+int av_nvtegra_channel_submit(AVNVTegraChannel *channel, AVNVTegraCmdbuf *cmdbuf, uint32_t *fence);
+int av_nvtegra_channel_set_submit_timeout(AVNVTegraChannel *channel, uint32_t timeout_ms);
+
+int av_nvtegra_syncpt_wait(AVNVTegraChannel *channel, uint32_t threshold, int32_t timeout);
+
+int av_nvtegra_map_allocate(AVNVTegraMap *map, AVNVTegraChannel *owner, uint32_t size,
+                            uint32_t align, int heap_mask, int flags);
+int av_nvtegra_map_free(AVNVTegraMap *map);
+int av_nvtegra_map_from_va(AVNVTegraMap *map, AVNVTegraChannel *owner, void *mem,
+                           uint32_t size, uint32_t align, uint32_t flags);
+int av_nvtegra_map_close(AVNVTegraMap *map);
+int av_nvtegra_map_map(AVNVTegraMap *map);
+int av_nvtegra_map_unmap(AVNVTegraMap *map);
+int av_nvtegra_map_cache_op(AVNVTegraMap *map, int op, void *addr, size_t len);
+int av_nvtegra_map_realloc(AVNVTegraMap *map, uint32_t size, uint32_t align, int heap_mask, int flags);
+
+/*
+ * Convenience wrapper: allocate a map with av_nvtegra_map_allocate(), then
+ * map it with av_nvtegra_map_map(). Returns the allocation error if that
+ * step fails, otherwise the result of the mapping step.
+ */
+static inline int av_nvtegra_map_create(AVNVTegraMap *map, AVNVTegraChannel *owner, uint32_t size, uint32_t align,
+                                        int heap_mask, int flags)
+{
+    int ret = av_nvtegra_map_allocate(map, owner, size, align, heap_mask, flags);
+
+    return ret < 0 ? ret : av_nvtegra_map_map(map);
+}
+
+/*
+ * Convenience wrapper: unmap a map with av_nvtegra_map_unmap(), then free
+ * it with av_nvtegra_map_free(). If unmapping fails its error is returned
+ * and the map is not freed.
+ */
+static inline int av_nvtegra_map_destroy(AVNVTegraMap *map) {
+    int ret = av_nvtegra_map_unmap(map);
+
+    return ret < 0 ? ret : av_nvtegra_map_free(map);
+}
+
+int av_nvtegra_cmdbuf_init(AVNVTegraCmdbuf *cmdbuf);
+int av_nvtegra_cmdbuf_deinit(AVNVTegraCmdbuf *cmdbuf);
+int av_nvtegra_cmdbuf_add_memory(AVNVTegraCmdbuf *cmdbuf, AVNVTegraMap *map, uint32_t offset, uint32_t size);
+int av_nvtegra_cmdbuf_clear(AVNVTegraCmdbuf *cmdbuf);
+int av_nvtegra_cmdbuf_begin(AVNVTegraCmdbuf *cmdbuf, uint32_t class_id);
+int av_nvtegra_cmdbuf_end(AVNVTegraCmdbuf *cmdbuf);
+int av_nvtegra_cmdbuf_push_word(AVNVTegraCmdbuf *cmdbuf, uint32_t word);
+int av_nvtegra_cmdbuf_push_value(AVNVTegraCmdbuf *cmdbuf, uint32_t offset, uint32_t word);
+int av_nvtegra_cmdbuf_push_reloc(AVNVTegraCmdbuf *cmdbuf, uint32_t offset, AVNVTegraMap *target, uint32_t target_offset,
+                                 int reloc_type, int shift);
+int av_nvtegra_cmdbuf_push_syncpt_incr(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt);
+int av_nvtegra_cmdbuf_push_wait(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt, uint32_t fence);
+int av_nvtegra_cmdbuf_add_syncpt_incr(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt, uint32_t fence);
+int av_nvtegra_cmdbuf_add_waitchk(AVNVTegraCmdbuf *cmdbuf, uint32_t syncpt, uint32_t fence);
+
+/*
+ * Job allocation and submission routines
+ */
+int av_nvtegra_job_pool_init(AVNVTegraJobPool *pool, AVNVTegraChannel *channel,
+                             size_t input_map_size, off_t cmdbuf_off, size_t max_cmdbuf_size);
+int av_nvtegra_job_pool_uninit(AVNVTegraJobPool *pool);
+AVBufferRef *av_nvtegra_job_pool_get(AVNVTegraJobPool *pool);
+
+int av_nvtegra_job_submit(AVNVTegraJobPool *pool, AVNVTegraJob *job);
+int av_nvtegra_job_wait(AVNVTegraJobPool *pool, AVNVTegraJob *job, int timeout);
+
+/* Return the handle of a map, hiding the platform-specific member layout */
+static inline uint32_t av_nvtegra_map_get_handle(AVNVTegraMap *map) {
+#ifdef __SWITCH__
+    return map->map.handle;
+#else
+    return map->handle;
+#endif
+}
+
+/* Return the CPU address of a map, hiding the platform-specific member layout */
+static inline void *av_nvtegra_map_get_addr(AVNVTegraMap *map) {
+#ifdef __SWITCH__
+    return map->map.cpu_addr;
+#else
+    return map->cpu_addr;
+#endif
+}
+
+/* Return the size of a map, hiding the platform-specific member layout */
+static inline uint32_t av_nvtegra_map_get_size(AVNVTegraMap *map) {
+#ifdef __SWITCH__
+    return map->map.size;
+#else
+    return map->size;
+#endif
+}
+
+/* Addresses are shifted by 8 bits in the command buffer, requiring an alignment to 256 */
+#define AV_NVTEGRA_MAP_ALIGN (1 << 8)
+
+/*
+ * Build a register field value: mask value to the width of the field and
+ * shift it into position. offset ## _ ## field must expand to a "hi:lo" bit
+ * range, which the (1?hi:lo) and (0?hi:lo) expressions split into its upper
+ * and lower bit. value is parenthesized so that compound arguments such as
+ * "a | b" are masked as a whole.
+ */
+#define AV_NVTEGRA_VALUE(offset, field, value)                                                    \
+    (((value) &                                                                                   \
+    ((uint32_t)((UINT64_C(1) << ((1?offset ## _ ## field) - (0?offset ## _ ## field) + 1)) - 1))) \
+    << (0?offset ## _ ## field))
+
+/*
+ * Like AV_NVTEGRA_VALUE, but the value is the named constant
+ * offset ## _ ## field ## _ ## value, masked to the width of the field and
+ * shifted to its lowest bit. offset ## _ ## field must expand to a "hi:lo"
+ * bit range.
+ */
+#define AV_NVTEGRA_ENUM(offset, field, value)                                                     \
+    ((offset ## _ ## field ## _ ## value &                                                        \
+    ((uint32_t)((UINT64_C(1) << ((1?offset ## _ ## field) - (0?offset ## _ ## field) + 1)) - 1))) \
+    << (0?offset ## _ ## field))
+
+/*
+ * Push a value for the method at byte offset "offset" (converted to a word
+ * index) with av_nvtegra_cmdbuf_push_value().
+ *
+ * This is a GNU statement expression with hidden control flow: on failure
+ * it executes "return _err;" in the ENCLOSING function, so it may only be
+ * used inside functions that return an int error code.
+ */
+#define AV_NVTEGRA_PUSH_VALUE(cmdbuf, offset, value) ({                                  \
+    int _err = av_nvtegra_cmdbuf_push_value(cmdbuf, (offset) / sizeof(uint32_t), value); \
+    if (_err < 0)                                                                        \
+        return _err;                                                                     \
+})
+
+/*
+ * Push a relocation for the method at byte offset "offset" (converted to a
+ * word index) with av_nvtegra_cmdbuf_push_reloc(), using a fixed shift of 8.
+ *
+ * Same hidden control flow as AV_NVTEGRA_PUSH_VALUE: on failure the
+ * enclosing function returns the error code.
+ */
+#define AV_NVTEGRA_PUSH_RELOC(cmdbuf, offset, target, target_offset, type) ({    \
+    int _err = av_nvtegra_cmdbuf_push_reloc(cmdbuf, (offset) / sizeof(uint32_t), \
+                                            target, target_offset, type, 8);     \
+    if (_err < 0)                                                                \
+        return _err;                                                             \
+})
+
+#endif /* AVUTIL_NVTEGRA_H */
diff --git a/libavutil/nvtegra_host1x.h b/libavutil/nvtegra_host1x.h
new file mode 100644
index 0000000000..25e37eae61
--- /dev/null
+++ b/libavutil/nvtegra_host1x.h
@@ -0,0 +1,94 @@ 
+/*
+ * Copyright (c) 2024 averne <averne381@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef AVUTIL_NVTEGRA_HOST1X_H
+#define AVUTIL_NVTEGRA_HOST1X_H
+
+#include <stdint.h>
+
+#include "macros.h"
+
+/*
+ * From L4T include/linux/host1x.h
+ *
+ * Class identifiers, as passed to host1x_opcode_setclass() to select the
+ * class that subsequent commands are addressed to.
+ */
+enum host1x_class {
+    HOST1X_CLASS_HOST1X  = 0x01,
+    HOST1X_CLASS_NVENC   = 0x21,
+    HOST1X_CLASS_VI      = 0x30,
+    HOST1X_CLASS_ISPA    = 0x32,
+    HOST1X_CLASS_ISPB    = 0x34,
+    HOST1X_CLASS_GR2D    = 0x51,
+    HOST1X_CLASS_GR2D_SB = 0x52,
+    HOST1X_CLASS_VIC     = 0x5d,
+    HOST1X_CLASS_GR3D    = 0x60,
+    HOST1X_CLASS_NVJPG   = 0xc0,
+    HOST1X_CLASS_NVDEC   = 0xf0,
+};
+
+/*
+ * Encode a setclass command word: opcode 0 in bits 31:28, offset shifted to
+ * bit 16, class_id shifted to bit 6 and mask in the low bits. Arguments are
+ * not range-checked.
+ */
+static inline uint32_t host1x_opcode_setclass(unsigned class_id, unsigned offset, unsigned mask) {
+    const uint32_t opcode = 0;
+
+    return (opcode << 28) | (offset << 16) | (class_id << 6) | mask;
+}
+
+/*
+ * Encode an incr command word: opcode 1 in bits 31:28, offset shifted to
+ * bit 16 and count in the low bits. Arguments are not range-checked.
+ */
+static inline uint32_t host1x_opcode_incr(unsigned offset, unsigned count) {
+    const uint32_t opcode = 1;
+
+    return (opcode << 28) | (offset << 16) | count;
+}
+
+/*
+ * Encode a nonincr command word: opcode 2 in bits 31:28, offset shifted to
+ * bit 16 and count in the low bits. Arguments are not range-checked.
+ */
+static inline uint32_t host1x_opcode_nonincr(unsigned offset, unsigned count) {
+    const uint32_t opcode = 2;
+
+    return (opcode << 28) | (offset << 16) | count;
+}
+
+/*
+ * Encode a mask command word: opcode 3 in bits 31:28, offset shifted to
+ * bit 16 and mask in the low bits. Arguments are not range-checked.
+ */
+static inline uint32_t host1x_opcode_mask(unsigned offset, unsigned mask) {
+    const uint32_t opcode = 3;
+
+    return (opcode << 28) | (offset << 16) | mask;
+}
+
+/*
+ * Encode an imm command word: opcode 4 in bits 31:28, offset shifted to
+ * bit 16 and value in the low bits. Arguments are not range-checked.
+ */
+static inline uint32_t host1x_opcode_imm(unsigned offset, unsigned value) {
+    const uint32_t opcode = 4;
+
+    return (opcode << 28) | (offset << 16) | value;
+}
+
+#define NV_CLASS_HOST_LOAD_SYNCPT_PAYLOAD                                  (0x00000138)
+#define NV_CLASS_HOST_WAIT_SYNCPT                                          (0x00000140)
+
+#define NV_THI_INCR_SYNCPT                                                 (0x00000000)
+#define NV_THI_INCR_SYNCPT_INDX                                            7:0
+#define NV_THI_INCR_SYNCPT_COND                                            15:8
+#define NV_THI_INCR_SYNCPT_COND_IMMEDIATE                                  (0x00000000)
+#define NV_THI_INCR_SYNCPT_COND_OP_DONE                                    (0x00000001)
+#define NV_THI_INCR_SYNCPT_ERR                                             (0x00000008)
+#define NV_THI_INCR_SYNCPT_ERR_COND_STS_IMM                                0:0
+#define NV_THI_INCR_SYNCPT_ERR_COND_STS_OPDONE                             1:1
+#define NV_THI_CTXSW_INCR_SYNCPT                                           (0x0000000c)
+#define NV_THI_CTXSW_INCR_SYNCPT_INDX                                      7:0
+#define NV_THI_CTXSW                                                       (0x00000020)
+#define NV_THI_CTXSW_CURR_CLASS                                            9:0
+#define NV_THI_CTXSW_AUTO_ACK                                              11:11
+#define NV_THI_CTXSW_CURR_CHANNEL                                          15:12
+#define NV_THI_CTXSW_NEXT_CLASS                                            25:16
+#define NV_THI_CTXSW_NEXT_CHANNEL                                          31:28
+#define NV_THI_CONT_SYNCPT_EOF                                             (0x00000028)
+#define NV_THI_CONT_SYNCPT_EOF_INDEX                                       7:0
+#define NV_THI_CONT_SYNCPT_EOF_COND                                        8:8
+#define NV_THI_METHOD0                                                     (0x00000040)
+#define NV_THI_METHOD0_OFFSET                                              11:0
+#define NV_THI_METHOD1                                                     (0x00000044)
+#define NV_THI_METHOD1_DATA                                                31:0
+#define NV_THI_INT_STATUS                                                  (0x00000078)
+#define NV_THI_INT_STATUS_FALCON_INT                                       0:0
+#define NV_THI_INT_MASK                                                    (0x0000007c)
+#define NV_THI_INT_MASK_FALCON_INT                                         0:0
+
+#endif /* AVUTIL_NVTEGRA_HOST1X_H */
diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c
index 1c0bcf2232..bb14b1b306 100644
--- a/libavutil/pixdesc.c
+++ b/libavutil/pixdesc.c
@@ -2791,6 +2791,10 @@  static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
         },
         .flags = AV_PIX_FMT_FLAG_PLANAR,
     },
+    [AV_PIX_FMT_NVTEGRA] = {
+        .name = "nvtegra",
+        .flags = AV_PIX_FMT_FLAG_HWACCEL,
+    },
 };
 
 static const char * const color_range_names[] = {
diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
index a7f50e1690..a3213c792a 100644
--- a/libavutil/pixfmt.h
+++ b/libavutil/pixfmt.h
@@ -439,6 +439,14 @@  enum AVPixelFormat {
      */
     AV_PIX_FMT_D3D12,
 
+    /**
+     * Hardware surfaces for Tegra devices.
+     *
+     * data[0..2] points to memory-mapped buffer containing frame data
+     * buf[0] contains an AVBufferRef to an AVNVTegraMap
+     */
+    AV_PIX_FMT_NVTEGRA,
+
     AV_PIX_FMT_NB         ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
 };