From patchwork Fri Nov 10 15:43:54 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= X-Patchwork-Id: 44608 Delivered-To: ffmpegpatchwork2@gmail.com Received: by 2002:a05:6a20:4f99:b0:181:818d:5e7f with SMTP id gh25csp344515pzb; Fri, 10 Nov 2023 07:44:06 -0800 (PST) X-Google-Smtp-Source: AGHT+IEf9HYmsEbWuzXI0P0prLD0XvRre2Yf7G+EzaKVrkn0gmnNmbMu1wkJqaIDA0BjP6mQunbs X-Received: by 2002:ac2:55a3:0:b0:507:a12c:558c with SMTP id y3-20020ac255a3000000b00507a12c558cmr3979945lfg.46.1699631045975; Fri, 10 Nov 2023 07:44:05 -0800 (PST) ARC-Seal: i=1; a=rsa-sha256; t=1699631045; cv=none; d=google.com; s=arc-20160816; b=Ljv1VzsCL3xFIn2dO2iJqocEy2YK1LMPgsgu1qHd3yuNv05ZI3vfohOeYKmPS2LUL+ hR79OAdYhFwCZc6xYXmQy9xoAUooaC61yW97o7b2POlwePGVkAWHmZ7iITRsOWYTvRUJ jF0O9mKHTdUgmGq1iUltOl/4irZWtLJBJbrstYi+VouWY7FNPOkYUcanKTgyjxebMvI3 4OnKcatGYc1oxGFzJtUGndPLSo0k1pyxmqyYKZfRXK56vqIoxqbQk1uX69K9QW4FCTZx 5OZb0U2RD7xws5RmsjclmbSYjmNc8xKNyqBjpe9Ra0maA2xS9L86vzFoBFViwXjBON1w 7pUw== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816; h=sender:errors-to:content-transfer-encoding:reply-to:list-subscribe :list-help:list-post:list-archive:list-unsubscribe:list-id :precedence:subject:mime-version:message-id:date:to:from :delivered-to; bh=KLdvxfMLjKLqk+tRRyUq9d6ZU9EV1m8CTHy69xJWb9w=; fh=YOA8vD9MJZuwZ71F/05pj6KdCjf6jQRmzLS+CATXUQk=; b=UWN2R7ao7VAFZ8B6v1bWgs+irLqwhI5Sso4I73aU1mmktzsLBS1TUpcTaWl+DHr13T vcChGR0MjSxYiWkEFXSUyZFve3aSW0rLx+O8TCEU5G5QXizB9eReoSfWx32xlb3rF0fX zcUWfpUNcUalRTebjieGydF+8blim9hB5m0RgrEg7uLbetfStNZUmE6oh+swV817R8gd R/GXiQS7zTj4G9I7gPhXPzBAnmxB5XiBZyCC+7m1lgdydw0eXydHnMaPRMdP8p58GuG6 G/YsExyVMAx+7LWmDuIJlnfBBmjySDNw+r+MuFUDwWfc2U/ge/nOw1WJsgCpPJUEFjEq 84rA== ARC-Authentication-Results: i=1; mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) 
smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id v30-20020a50a45e000000b0053f9ba36904si9370820edb.41.2023.11.10.07.44.05; Fri, 10 Nov 2023 07:44:05 -0800 (PST) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 4C78F68CC04; Fri, 10 Nov 2023 17:44:03 +0200 (EET) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from ursule.remlab.net (vps-a2bccee9.vps.ovh.net [51.75.19.47]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 812FA68C83C for ; Fri, 10 Nov 2023 17:43:56 +0200 (EET) Received: from basile.remlab.net (localhost [IPv6:::1]) by ursule.remlab.net (Postfix) with ESMTP id 19469C0014 for ; Fri, 10 Nov 2023 17:43:56 +0200 (EET) From: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= To: ffmpeg-devel@ffmpeg.org Date: Fri, 10 Nov 2023 17:43:54 +0200 Message-ID: <20231110154355.5011-1-remi@remlab.net> X-Mailer: git-send-email 2.42.0 MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCHv2 1/2] sws/rgb2rgb: rework R-V V YUY2 to 4:2:2 planar X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" X-TUID: Psn5DuUXXwfZ This saves three scratch registers and three instructions per line. The performance gains are mostly negligible. The main point is to free up registers for further rework. 
--- libswscale/riscv/rgb2rgb_rvv.S | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/libswscale/riscv/rgb2rgb_rvv.S b/libswscale/riscv/rgb2rgb_rvv.S index 671089c842..172f5918dc 100644 --- a/libswscale/riscv/rgb2rgb_rvv.S +++ b/libswscale/riscv/rgb2rgb_rvv.S @@ -127,31 +127,30 @@ func ff_deinterleave_bytes_rvv, zve32x endfunc .macro yuy2_to_i422p y_shift - addi a4, a4, 1 + slli t4, a4, 1 // pixel width -> (source) byte width lw t6, (sp) + sub a6, a6, a4 srai a4, a4, 1 // pixel width -> chroma width + sub a7, a7, a4 + sub t6, t6, t4 1: mv t4, a4 - mv t3, a3 - mv t0, a0 - mv t1, a1 - mv t2, a2 addi a5, a5, -1 2: vsetvli t5, t4, e8, m2, ta, ma - vlseg2e16.v v16, (t3) + vlseg2e16.v v16, (a3) sub t4, t4, t5 vnsrl.wi v24, v16, \y_shift // Y0 - sh2add t3, t5, t3 + sh2add a3, t5, a3 vnsrl.wi v26, v20, \y_shift // Y1 vnsrl.wi v28, v16, 8 - \y_shift // U vnsrl.wi v30, v20, 8 - \y_shift // V - vsseg2e8.v v24, (t0) - sh1add t0, t5, t0 - vse8.v v28, (t1) - add t1, t5, t1 - vse8.v v30, (t2) - add t2, t5, t2 + vsseg2e8.v v24, (a0) + sh1add a0, t5, a0 + vse8.v v28, (a1) + add a1, t5, a1 + vse8.v v30, (a2) + add a2, t5, a2 bnez t4, 2b add a3, a3, t6 From patchwork Fri Nov 10 15:43:55 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= X-Patchwork-Id: 44609 Delivered-To: ffmpegpatchwork2@gmail.com Received: by 2002:a05:6a20:4f99:b0:181:818d:5e7f with SMTP id gh25csp344575pzb; Fri, 10 Nov 2023 07:44:15 -0800 (PST) X-Google-Smtp-Source: AGHT+IEc/vlJV4BqnaPlP1w+nKOtTSvo6lL54Jkw4NCvetgcAnVYb0tXpTeyDXVxJ9VH23+UTWVQ X-Received: by 2002:a17:906:ee81:b0:9dc:21c7:9ae5 with SMTP id wt1-20020a170906ee8100b009dc21c79ae5mr7236980ejb.26.1699631055150; Fri, 10 Nov 2023 07:44:15 -0800 (PST) ARC-Seal: i=1; a=rsa-sha256; t=1699631055; cv=none; d=google.com; s=arc-20160816; b=TQiWxVrfCitTIhVZJnlCguHDOnnW+5uarbVULta+CD412Hx7XCzPX9QmJ4ZenkMJKA 
eRVzoksbm5SGdK0YVBP/8vnIzAfeEVIN55MquvYQKMQLfGyV41UKmRXAAX5tFHFkoUdT zvbtjbg7nesx+sXtvttzqDAYa9ETxBnjywy7+ZYxiTvocvoK36dXBujAyASHuDhN9Kug YgK7SmIKzm4SykK8ATwWubZZb2CY2ItZChWjbVRkYRzakUASu61Z5K35lGHTuxYL++Xt Ba49DzMA0mQKWqUbpnk1tNhhqAXURtxpJqi1ceW7AQPGkhAQpB6C/VucuLf42nMmuLT1 e+JA== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816; h=sender:errors-to:content-transfer-encoding:reply-to:list-subscribe :list-help:list-post:list-archive:list-unsubscribe:list-id :precedence:subject:mime-version:references:in-reply-to:message-id :date:to:from:delivered-to; bh=akONX2av+ZzxhqvflTsbJpCIddC8gBnvzO7/pIUkIO8=; fh=YOA8vD9MJZuwZ71F/05pj6KdCjf6jQRmzLS+CATXUQk=; b=q0KCyn0lqK6UYaaHRFrUmkipYdkwHBWHtdvbs4xFLGd6ffYs1pX37+CpzZp4M7EITU wNhzV63sryY6/qDcmy2+4+5kEx6W3jdWjD7toSrUkiYl5oj/J0+2yVxj84w7EjyvhgvQ gwUOsQaVAonkyL+lENV1RSaNdNwrWrbNAi8/Qp3Amh+aZ9U2SiOtPNIk+dlIs8QFBpwU sCRjGMjGNePAe0yTUshiKEQNRHV14MczhwsfCC6LJgAgI1lZ8rKSM9IQmVUCqHQI7PRA iw9Ca3Jhanwmra/Z/zvZH9RqK3jTkii/JAjGPA7UxxkwI8bndVKGgTD7xSBDhhAc1Yfz ZkfQ== ARC-Authentication-Results: i=1; mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. 
[79.124.17.100]) by mx.google.com with ESMTP id hv10-20020a17090760ca00b009e068673373si4777686ejc.94.2023.11.10.07.44.14; Fri, 10 Nov 2023 07:44:15 -0800 (PST) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 412E568CC0F; Fri, 10 Nov 2023 17:44:04 +0200 (EET) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from ursule.remlab.net (vps-a2bccee9.vps.ovh.net [51.75.19.47]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id AB65668C83C for ; Fri, 10 Nov 2023 17:43:56 +0200 (EET) Received: from basile.remlab.net (localhost [IPv6:::1]) by ursule.remlab.net (Postfix) with ESMTP id 47CB2C0017 for ; Fri, 10 Nov 2023 17:43:56 +0200 (EET) From: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= To: ffmpeg-devel@ffmpeg.org Date: Fri, 10 Nov 2023 17:43:55 +0200 Message-ID: <20231110154355.5011-2-remi@remlab.net> X-Mailer: git-send-email 2.42.0 In-Reply-To: <20231110154355.5011-1-remi@remlab.net> References: <20231110154355.5011-1-remi@remlab.net> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCHv2 2/2] sws/rgb2rgb: fix unaligned accesses in R-V V YUYV to I422p X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" X-TUID: OJh4ye7tTFDT In my personal opinion, we should not need to support unaligned YUY2 pixel maps. They should always be aligned to at least 32 bits, and the current code assumes just 16 bits. 
However checkasm does test for unaligned input bitmaps. QEMU accepts it, but real hardware does not. In this particular case, we can at the same time improve performance and handle unaligned inputs, so do just that. uyvytoyuv422_c: 104379.0 uyvytoyuv422_c: 104060.0 uyvytoyuv422_rvv_i32: 25284.0 (before) uyvytoyuv422_rvv_i32: 19303.2 (after) --- libswscale/riscv/rgb2rgb.c | 6 +++-- libswscale/riscv/rgb2rgb_rvv.S | 47 ++++++++++++++++++---------------- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/libswscale/riscv/rgb2rgb.c b/libswscale/riscv/rgb2rgb.c index 565f0b77f1..4fa1f5afd9 100644 --- a/libswscale/riscv/rgb2rgb.c +++ b/libswscale/riscv/rgb2rgb.c @@ -55,8 +55,10 @@ av_cold void rgb2rgb_init_riscv(void) shuffle_bytes_1230 = ff_shuffle_bytes_1230_rvv; shuffle_bytes_3012 = ff_shuffle_bytes_3012_rvv; interleaveBytes = ff_interleave_bytes_rvv; - uyvytoyuv422 = ff_uyvytoyuv422_rvv; - yuyvtoyuv422 = ff_yuyvtoyuv422_rvv; + if (flags & AV_CPU_FLAG_RVB_BASIC) { + uyvytoyuv422 = ff_uyvytoyuv422_rvv; + yuyvtoyuv422 = ff_yuyvtoyuv422_rvv; + } } #endif } diff --git a/libswscale/riscv/rgb2rgb_rvv.S b/libswscale/riscv/rgb2rgb_rvv.S index 172f5918dc..21e30ab8bb 100644 --- a/libswscale/riscv/rgb2rgb_rvv.S +++ b/libswscale/riscv/rgb2rgb_rvv.S @@ -126,32 +126,35 @@ func ff_deinterleave_bytes_rvv, zve32x ret endfunc -.macro yuy2_to_i422p y_shift - slli t4, a4, 1 // pixel width -> (source) byte width +.macro yuy2_to_i422p luma, chroma + srai t4, a4, 1 // pixel width -> chroma width lw t6, (sp) + slli t5, a4, 1 // pixel width -> (source) byte width sub a6, a6, a4 - srai a4, a4, 1 // pixel width -> chroma width - sub a7, a7, a4 - sub t6, t6, t4 + sub a7, a7, t4 + sub t6, t6, t5 + vsetvli t2, zero, e8, m4, ta, ma 1: mv t4, a4 addi a5, a5, -1 2: - vsetvli t5, t4, e8, m2, ta, ma - vlseg2e16.v v16, (a3) - sub t4, t4, t5 - vnsrl.wi v24, v16, \y_shift // Y0 - sh2add a3, t5, a3 - vnsrl.wi v26, v20, \y_shift // Y1 - vnsrl.wi v28, v16, 8 - \y_shift // U - vnsrl.wi v30, v20, 8 
- \y_shift // V - vsseg2e8.v v24, (a0) - sh1add a0, t5, a0 - vse8.v v28, (a1) - add a1, t5, a1 - vse8.v v30, (a2) - add a2, t5, a2 - bnez t4, 2b + min t0, t2, t4 // ensure even VL on penultimate iteration + vsetvli t0, t0, e8, m4, ta, ma + vlseg2e8.v v16, (a3) + srli t1, t0, 1 + vsetvli zero, t1, e8, m2, ta, ma + vnsrl.wi v24, \chroma, 0 // U + sub t4, t4, t0 + vnsrl.wi v28, \chroma, 8 // V + sh1add a3, t0, a3 + vse8.v v24, (a1) + add a1, t1, a1 + vse8.v v28, (a2) + add a2, t1, a2 + vsetvli zero, t0, e8, m4, ta, ma + vse8.v \luma, (a0) + add a0, t0, a0 + bnez t4, 2b add a3, a3, t6 add a0, a0, a6 @@ -163,9 +166,9 @@ endfunc .endm func ff_uyvytoyuv422_rvv, zve32x - yuy2_to_i422p 8 + yuy2_to_i422p v20, v16 endfunc func ff_yuyvtoyuv422_rvv, zve32x - yuy2_to_i422p 0 + yuy2_to_i422p v16, v20 endfunc