From patchwork Wed Aug 7 21:33:28 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Lynne X-Patchwork-Id: 50932 Delivered-To: ffmpegpatchwork2@gmail.com Received: by 2002:a59:d7b2:0:b0:489:2eb3:e4c4 with SMTP id dc18csp648839vqb; Wed, 7 Aug 2024 14:34:27 -0700 (PDT) X-Forwarded-Encrypted: i=2; AJvYcCU51AaMEXdHUkNSEwFjv0bqiLlWJbLaCgLt97MVlyd4QsdRp2brj8ilSM+ia/on4WOxjSO3aDVXvzPqQfLWr6zaD+Xi+E4Fi+BumA== X-Google-Smtp-Source: AGHT+IFcYPwwbKowkLn7JVLS5tBIqbCpJJWP+ryLO8vDHqgZLpERDMO1/SEagrOksqeJsGiiPmCI X-Received: by 2002:a05:6512:3d10:b0:52c:e1cd:39b7 with SMTP id 2adb3069b0e04-530bb3663camr13233530e87.5.1723066467171; Wed, 07 Aug 2024 14:34:27 -0700 (PDT) ARC-Seal: i=1; a=rsa-sha256; t=1723066467; cv=none; d=google.com; s=arc-20160816; b=s/RgtBo6gU8rr0/zXQPKFJ8MsF52cIWNGEYZ59Yv4XZPCRDSwuAU/uXK0+LJyZfHGH 08r6Y1FzXrgnHArMCZ1rTabyY2vnbauLgpVdGWqe1CP69zUxoT70f1tsfLT+QUPQl24F xgWphiPUp1cwEKUYY49BPAut4m+3v9vogreFdeQGMpeDy1vPTQ8Ve76Jd8XZrCFzBKYO r5VcQ92RccPcHBqzGP153syGREu9kVehWWmEgyYjbH0VlJMuMHs3XgZmKc1qPHiiW74c 8OBOY8nKsoNYGbp0FQuWf64zbHg9RNvZ2mlK2VWRtk8cL9g8jU/nFCmubIX5eIzfZ18B VE0g== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816; h=sender:errors-to:content-transfer-encoding:cc:reply-to:from :list-subscribe:list-help:list-post:list-archive:list-unsubscribe :list-id:precedence:subject:mime-version:references:in-reply-to :message-id:date:to:delivered-to; bh=mX+3ZgeSJzfiwmm/WmtAnV2Jx2R8KXNg1bU0u/hMfF8=; fh=nenT92/WZoU6unXd3J6UhGUdod4piddKfVtctNBOh6k=; b=rcADIC9CEpWw53s+m4/AUY8uJlud/uknaPRDmvnLaGNHXXgnwVwL/mL1f0xprc5uKR T8SmcxAoppBsKxRx+aYUKWoxi1rReONSZUxZ3i2DnWtpzQdS3qHidllhrG9n57SvdyzB z3AZvDXa45wvEqKF9GO2JUXMnYDpxWdVJetO+dmtSh5t0WnH3Tl54urGziL64NollQJV bl3pJ/l+qZ/ywZhmasKFcI9EyJ6G//PLlNhrlEZD8hcEfisdNFwwGuqUzQFFxP/J913Q lUHNz3OOVzvSwfnIZ1/WlGR8SouKNOaNnm+jImQWUVH4K8CTPe0mB4/j9lDq9MN4GlyS 3GQA==; dara=google.com ARC-Authentication-Results: i=1; mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id 2adb3069b0e04-530de4a04fbsi594578e87.404.2024.08.07.14.34.26; Wed, 07 Aug 2024 14:34:27 -0700 (PDT) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 3826D68DB87; Thu, 8 Aug 2024 00:34:10 +0300 (EEST) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from vidala.lynne.ee (vidala.pars.ee [116.203.72.101]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 4A89E68DB7E for ; Thu, 8 Aug 2024 00:34:03 +0300 (EEST) To: ffmpeg-devel@ffmpeg.org Date: Wed, 7 Aug 2024 23:33:28 +0200 Message-ID: <20240807213347.917235-3-dev@lynne.ee> X-Mailer: git-send-email 2.45.2.753.g447d99e1c3b In-Reply-To: <20240807213347.917235-1-dev@lynne.ee> References: <20240807213347.917235-1-dev@lynne.ee> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 03/13] hwcontext_vulkan: rewrite queue picking system for the new API X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Lynne via ffmpeg-devel From: Lynne Reply-To: FFmpeg development discussions and patches Cc: Lynne Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" X-TUID: 5FyM90e3Ja4B This allows us to support different video ops on different queues, as well as any other arbitrary queues we need. --- libavutil/hwcontext_vulkan.c | 262 ++++++++++++++++++++++------------- 1 file changed, 167 insertions(+), 95 deletions(-) diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c index 33d856ddd3..5baf68660a 100644 --- a/libavutil/hwcontext_vulkan.c +++ b/libavutil/hwcontext_vulkan.c @@ -1028,16 +1028,51 @@ end: } /* Picks the least used qf with the fewest unneeded flags, or -1 if none found */ -static inline int pick_queue_family(VkQueueFamilyProperties *qf, uint32_t num_qf, +static inline int pick_queue_family(VkQueueFamilyProperties2 *qf, uint32_t num_qf, VkQueueFlagBits flags) { int index = -1; uint32_t min_score = UINT32_MAX; for (int i = 0; i < num_qf; i++) { - const VkQueueFlagBits qflags = qf[i].queueFlags; + VkQueueFlagBits qflags = qf[i].queueFamilyProperties.queueFlags; + + /* Per the spec, reporting transfer caps is optional for these 2 types */ + if ((flags & VK_QUEUE_TRANSFER_BIT) && + (qflags & (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT))) + qflags |= VK_QUEUE_TRANSFER_BIT; + if (qflags & flags) { - uint32_t score = av_popcount(qflags) + qf[i].timestampValidBits; + uint32_t score = av_popcount(qflags) + qf[i].queueFamilyProperties.timestampValidBits; + if (score < min_score) { + index = i; + min_score = score; + } + } + } + + if (index > -1) + qf[index].queueFamilyProperties.timestampValidBits++; + + return index; +} + +static inline int pick_video_queue_family(VkQueueFamilyProperties2 *qf, + VkQueueFamilyVideoPropertiesKHR *qf_vid, uint32_t num_qf, + VkVideoCodecOperationFlagBitsKHR flags) +{ + int index = -1; + uint32_t min_score = UINT32_MAX; + + for (int i = 0; i < num_qf; i++) { + const VkQueueFlagBits qflags = qf[i].queueFamilyProperties.queueFlags; + const VkQueueFlagBits vflags = qf_vid[i].videoCodecOperations; + + if (!(qflags & (VK_QUEUE_VIDEO_ENCODE_BIT_KHR | VK_QUEUE_VIDEO_DECODE_BIT_KHR))) + continue; + + if (vflags & flags) { + uint32_t score = av_popcount(vflags) + qf[i].queueFamilyProperties.timestampValidBits; if (score < min_score) { index = i; min_score = score; @@ -1046,7 +1081,7 @@ static inline int pick_queue_family(VkQueueFamilyProperties *qf, uint32_t num_qf } if (index > -1) - qf[index].timestampValidBits++; + qf[index].queueFamilyProperties.timestampValidBits++; return index; } @@ -1054,12 +1089,12 @@ static inline int pick_queue_family(VkQueueFamilyProperties *qf, uint32_t num_qf static int setup_queue_families(AVHWDeviceContext *ctx, VkDeviceCreateInfo *cd) { uint32_t num; - float *weights; - VkQueueFamilyProperties *qf = NULL; VulkanDevicePriv *p = ctx->hwctx; AVVulkanDeviceContext *hwctx = &p->p; FFVulkanFunctions *vk = &p->vkctx.vkfn; - int graph_index, comp_index, tx_index, enc_index, dec_index; + + VkQueueFamilyProperties2 *qf = NULL; + VkQueueFamilyVideoPropertiesKHR *qf_vid = NULL; /* First get the number of queue families */ vk->GetPhysicalDeviceQueueFamilyProperties(hwctx->phys_dev, &num, NULL); @@ -1069,118 +1104,155 @@ static int setup_queue_families(AVHWDeviceContext *ctx, VkDeviceCreateInfo *cd) } /* Then allocate memory */ - qf = av_malloc_array(num, sizeof(VkQueueFamilyProperties)); + qf = av_malloc_array(num, sizeof(VkQueueFamilyProperties2)); if (!qf) return AVERROR(ENOMEM); + qf_vid = av_malloc_array(num, sizeof(VkQueueFamilyVideoPropertiesKHR)); + if (!qf_vid) + return AVERROR(ENOMEM); + + for (uint32_t i = 0; i < num; i++) { + qf_vid[i] = (VkQueueFamilyVideoPropertiesKHR) { + .sType = VK_STRUCTURE_TYPE_QUEUE_FAMILY_VIDEO_PROPERTIES_KHR, + }; + qf[i] = (VkQueueFamilyProperties2) { + .sType = VK_STRUCTURE_TYPE_QUEUE_FAMILY_PROPERTIES_2, + .pNext = &qf_vid[i], + }; + } + /* Finally retrieve the queue families */ - vk->GetPhysicalDeviceQueueFamilyProperties(hwctx->phys_dev, &num, qf); + vk->GetPhysicalDeviceQueueFamilyProperties2(hwctx->phys_dev, &num, qf); av_log(ctx, AV_LOG_VERBOSE, "Queue families:\n"); for (int i = 0; i < num; i++) { av_log(ctx, AV_LOG_VERBOSE, " %i:%s%s%s%s%s%s%s (queues: %i)\n", i, - ((qf[i].queueFlags) & VK_QUEUE_GRAPHICS_BIT) ? " graphics" : "", - ((qf[i].queueFlags) & VK_QUEUE_COMPUTE_BIT) ? " compute" : "", - ((qf[i].queueFlags) & VK_QUEUE_TRANSFER_BIT) ? " transfer" : "", - ((qf[i].queueFlags) & VK_QUEUE_VIDEO_ENCODE_BIT_KHR) ? " encode" : "", - ((qf[i].queueFlags) & VK_QUEUE_VIDEO_DECODE_BIT_KHR) ? " decode" : "", - ((qf[i].queueFlags) & VK_QUEUE_SPARSE_BINDING_BIT) ? " sparse" : "", - ((qf[i].queueFlags) & VK_QUEUE_PROTECTED_BIT) ? " protected" : "", - qf[i].queueCount); + ((qf[i].queueFamilyProperties.queueFlags) & VK_QUEUE_GRAPHICS_BIT) ? " graphics" : "", + ((qf[i].queueFamilyProperties.queueFlags) & VK_QUEUE_COMPUTE_BIT) ? " compute" : "", + ((qf[i].queueFamilyProperties.queueFlags) & VK_QUEUE_TRANSFER_BIT) ? " transfer" : "", + ((qf[i].queueFamilyProperties.queueFlags) & VK_QUEUE_VIDEO_ENCODE_BIT_KHR) ? " encode" : "", + ((qf[i].queueFamilyProperties.queueFlags) & VK_QUEUE_VIDEO_DECODE_BIT_KHR) ? " decode" : "", + ((qf[i].queueFamilyProperties.queueFlags) & VK_QUEUE_SPARSE_BINDING_BIT) ? " sparse" : "", + ((qf[i].queueFamilyProperties.queueFlags) & VK_QUEUE_PROTECTED_BIT) ? " protected" : "", + qf[i].queueFamilyProperties.queueCount); /* We use this field to keep a score of how many times we've used that * queue family in order to make better choices. */ - qf[i].timestampValidBits = 0; + qf[i].queueFamilyProperties.timestampValidBits = 0; } + hwctx->nb_qf = 0; + /* Pick each queue family to use */ - graph_index = pick_queue_family(qf, num, VK_QUEUE_GRAPHICS_BIT); - comp_index = pick_queue_family(qf, num, VK_QUEUE_COMPUTE_BIT); - tx_index = pick_queue_family(qf, num, VK_QUEUE_TRANSFER_BIT); - enc_index = pick_queue_family(qf, num, VK_QUEUE_VIDEO_ENCODE_BIT_KHR); - dec_index = pick_queue_family(qf, num, VK_QUEUE_VIDEO_DECODE_BIT_KHR); +#define PICK_QF(type, vid_op) \ + do { \ + uint32_t i; \ + uint32_t idx; \ + \ + if (vid_op) \ + idx = pick_video_queue_family(qf, qf_vid, num, vid_op); \ + else \ + idx = pick_queue_family(qf, num, type); \ + \ + if (idx == -1) \ + continue; \ + \ + for (i = 0; i < hwctx->nb_qf; i++) { \ + if (hwctx->qf[i].idx == idx) { \ + hwctx->qf[i].flags |= type; \ + hwctx->qf[i].video_caps |= vid_op; \ + break; \ + } \ + } \ + if (i == hwctx->nb_qf) { \ + hwctx->qf[i].idx = idx; \ + hwctx->qf[i].num = qf[idx].queueFamilyProperties.queueCount; \ + hwctx->qf[i].flags = type; \ + hwctx->qf[i].video_caps = vid_op; \ + hwctx->nb_qf++; \ + } \ + } while (0) + + PICK_QF(VK_QUEUE_GRAPHICS_BIT, VK_VIDEO_CODEC_OPERATION_NONE_KHR); + PICK_QF(VK_QUEUE_COMPUTE_BIT, VK_VIDEO_CODEC_OPERATION_NONE_KHR); + PICK_QF(VK_QUEUE_TRANSFER_BIT, VK_VIDEO_CODEC_OPERATION_NONE_KHR); + + PICK_QF(VK_QUEUE_VIDEO_ENCODE_BIT_KHR, VK_VIDEO_CODEC_OPERATION_ENCODE_H264_BIT_KHR); + PICK_QF(VK_QUEUE_VIDEO_DECODE_BIT_KHR, VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR); + + PICK_QF(VK_QUEUE_VIDEO_ENCODE_BIT_KHR, VK_VIDEO_CODEC_OPERATION_ENCODE_H265_BIT_KHR); + PICK_QF(VK_QUEUE_VIDEO_DECODE_BIT_KHR, VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR); + + PICK_QF(VK_QUEUE_VIDEO_DECODE_BIT_KHR, VK_VIDEO_CODEC_OPERATION_DECODE_AV1_BIT_KHR); + + av_free(qf); + av_free(qf_vid); + +#undef PICK_QF + + cd->pQueueCreateInfos = av_malloc_array(hwctx->nb_qf, + sizeof(VkDeviceQueueCreateInfo)); + if (!cd->pQueueCreateInfos) + return AVERROR(ENOMEM); + + for (uint32_t i = 0; i < hwctx->nb_qf; i++) { + int dup = 0; + float *weights = NULL; + VkDeviceQueueCreateInfo *pc; + for (uint32_t j = 0; j < cd->queueCreateInfoCount; j++) { + if (hwctx->qf[i].idx == cd->pQueueCreateInfos[j].queueFamilyIndex) { + dup = 1; + break; + } + } + if (dup) + continue; + + weights = av_malloc_array(hwctx->qf[i].num, sizeof(float)); + if (!weights) { + for (uint32_t j = 0; j < cd->queueCreateInfoCount; j++) + av_free((void *)cd->pQueueCreateInfos[i].pQueuePriorities); + av_free((void *)cd->pQueueCreateInfos); + return AVERROR(ENOMEM); + } + + for (uint32_t j = 0; j < hwctx->qf[i].num; j++) + weights[j] = 1.0; - /* Signalling the transfer capabilities on a queue family is optional */ - if (tx_index < 0) { - tx_index = pick_queue_family(qf, num, VK_QUEUE_COMPUTE_BIT); - if (tx_index < 0) - tx_index = pick_queue_family(qf, num, VK_QUEUE_GRAPHICS_BIT); + pc = (VkDeviceQueueCreateInfo *)cd->pQueueCreateInfos; + pc[cd->queueCreateInfoCount++] = (VkDeviceQueueCreateInfo) { + .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, + .queueFamilyIndex = hwctx->qf[i].idx, + .queueCount = hwctx->qf[i].num, + .pQueuePriorities = weights, + }; } + /* Setup deprecated fields */ hwctx->queue_family_index = -1; hwctx->queue_family_comp_index = -1; hwctx->queue_family_tx_index = -1; hwctx->queue_family_encode_index = -1; hwctx->queue_family_decode_index = -1; -#define SETUP_QUEUE(qf_idx) \ - if (qf_idx > -1) { \ - int fidx = qf_idx; \ - int qc = qf[fidx].queueCount; \ - VkDeviceQueueCreateInfo *pc; \ - \ - if (fidx == graph_index) { \ - hwctx->queue_family_index = fidx; \ - hwctx->nb_graphics_queues = qc; \ - graph_index = -1; \ - } \ - if (fidx == comp_index) { \ - hwctx->queue_family_comp_index = fidx; \ - hwctx->nb_comp_queues = qc; \ - comp_index = -1; \ - } \ - if (fidx == tx_index) { \ - hwctx->queue_family_tx_index = fidx; \ - hwctx->nb_tx_queues = qc; \ - tx_index = -1; \ - } \ - if (fidx == enc_index) { \ - hwctx->queue_family_encode_index = fidx; \ - hwctx->nb_encode_queues = qc; \ - enc_index = -1; \ - } \ - if (fidx == dec_index) { \ - hwctx->queue_family_decode_index = fidx; \ - hwctx->nb_decode_queues = qc; \ - dec_index = -1; \ - } \ - \ - pc = av_realloc((void *)cd->pQueueCreateInfos, \ - sizeof(*pc) * (cd->queueCreateInfoCount + 1)); \ - if (!pc) { \ - av_free(qf); \ - return AVERROR(ENOMEM); \ - } \ - cd->pQueueCreateInfos = pc; \ - pc = &pc[cd->queueCreateInfoCount]; \ - \ - weights = av_malloc(qc * sizeof(float)); \ - if (!weights) { \ - av_free(qf); \ - return AVERROR(ENOMEM); \ - } \ - \ - memset(pc, 0, sizeof(*pc)); \ - pc->sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; \ - pc->queueFamilyIndex = fidx; \ - pc->queueCount = qc; \ - pc->pQueuePriorities = weights; \ - \ - for (int i = 0; i < qc; i++) \ - weights[i] = 1.0f / qc; \ - \ - cd->queueCreateInfoCount++; \ - } - - SETUP_QUEUE(graph_index) - SETUP_QUEUE(comp_index) - SETUP_QUEUE(tx_index) - SETUP_QUEUE(enc_index) - SETUP_QUEUE(dec_index) - -#undef SETUP_QUEUE +#define SET_OLD_QF(field, nb_field, type) \ + do { \ + if (field < 0 && hwctx->qf[i].flags & type) { \ + field = hwctx->qf[i].idx; \ + nb_field = hwctx->qf[i].num; \ + } \ + } while (0) - av_free(qf); + for (uint32_t i = 0; i < hwctx->nb_qf; i++) { + SET_OLD_QF(hwctx->queue_family_index, hwctx->nb_graphics_queues, VK_QUEUE_GRAPHICS_BIT); + SET_OLD_QF(hwctx->queue_family_comp_index, hwctx->nb_comp_queues, VK_QUEUE_COMPUTE_BIT); + SET_OLD_QF(hwctx->queue_family_tx_index, hwctx->nb_tx_queues, VK_QUEUE_TRANSFER_BIT); + SET_OLD_QF(hwctx->queue_family_encode_index, hwctx->nb_encode_queues, VK_QUEUE_VIDEO_ENCODE_BIT_KHR); + SET_OLD_QF(hwctx->queue_family_decode_index, hwctx->nb_decode_queues, VK_QUEUE_VIDEO_DECODE_BIT_KHR); + } + +#undef SET_OLD_QF return 0; }