From patchwork Thu Jul 18 19:35:42 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= X-Patchwork-Id: 50633 Delivered-To: ffmpegpatchwork2@gmail.com Received: by 2002:a59:a742:0:b0:482:c625:d099 with SMTP id f2csp198455vqm; Thu, 18 Jul 2024 12:36:09 -0700 (PDT) X-Forwarded-Encrypted: i=2; AJvYcCUmoFe8mUEl+bnRcWa6TK1MQQpHIF0XmE6iBrDMXh+ilfxuMcncwt8PxGRUre5tA1jVONUWheV2GmK4CIpFAceXMxIiJJaCSd5X4g== X-Google-Smtp-Source: AGHT+IHY1/cz0KpfA8YbiweKR4knv2ZZOZIBkLXAU2wAyZIwaUlW6nctrLj8NBRpaJiq2edTd6dk X-Received: by 2002:a17:906:d286:b0:a77:eb34:3b42 with SMTP id a640c23a62f3a-a7a011c1472mr478487966b.12.1721331369337; Thu, 18 Jul 2024 12:36:09 -0700 (PDT) ARC-Seal: i=1; a=rsa-sha256; t=1721331369; cv=none; d=google.com; s=arc-20160816; b=YEs5a3ej9TcRNt0jm8uZjr7zqLXxugnt/RlTamorxlvhGOERw1GTGXPjhdZR5i68E2 TBg+A3BZ7vYmiuWf7PyvNFQUksbyzktRy52xLcv+DHqt6lJwI4YyBAYKON1LNWRxY2X8 rggExOWGsKaLrZzfjAiq+NtjSM1FsdvYnxMoByfRYnaMg8VV2vmjUQPTHtIkbl6hWcCg Ap8IqiAiDJtNKlIGbR3ydRLaiZxxncuGB8KzWdp/dD+CS2LqNaNz8UlElE5j8iv8wklW rljJNcrEGyofvI9je2In/ixOKhiXpeMI/sRWotYeMj3ophSvYID7qnIV6vdgFbFUp+Pa EOQQ== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816; h=sender:errors-to:content-transfer-encoding:reply-to:list-subscribe :list-help:list-post:list-archive:list-unsubscribe:list-id :precedence:subject:mime-version:message-id:date:to:from :delivered-to; bh=5r3kcdSI0kQw3KGNrSCpIQINc3B9skRqFp/hL/wCOTU=; fh=YOA8vD9MJZuwZ71F/05pj6KdCjf6jQRmzLS+CATXUQk=; b=k504B5gevzcyvujdn+wiDYwttOScJK7Et/JUSvZwczxKBpn2q7/WZV8zWiUrH/nrtw VU3bcyhp8w0V8YZz6/JrJtxrne4veOTPxII+OHk5ncYcZsgOMny/HPNaz5DqGIZrMGc7 aklR2/BRR2B53OJL1X3No7KX9LEoYhNvpla4H+BwaMxW7lcNQTkiPqk9rnD0Llr8zKw/ HSidhdd9U0xUxp0vAeZVEpLjMJ6CXOkQXqPAgwRAzJUJv5KnDLSEIfrZlxSeEGo0U5YR zFU0ZR+no0MFxss4oLHn+3YKv9u5Q3uxHphW1demTL4VrLS596Qvh9iBZjYHJD63/llx EmpA==; dara=google.com ARC-Authentication-Results: i=1; mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id a640c23a62f3a-a7a306bfeebsi8618166b.108.2024.07.18.12.36.09; Thu, 18 Jul 2024 12:36:09 -0700 (PDT) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 1C41668DB00; Thu, 18 Jul 2024 22:35:56 +0300 (EEST) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from ursule.remlab.net (vps-a2bccee9.vps.ovh.net [51.75.19.47]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 20AAA68D6BC for ; Thu, 18 Jul 2024 22:35:48 +0300 (EEST) Received: from basile.remlab.net (localhost [IPv6:::1]) by ursule.remlab.net (Postfix) with ESMTP id 1F2D4C0090 for ; Thu, 18 Jul 2024 22:35:47 +0300 (EEST) From: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= To: ffmpeg-devel@ffmpeg.org Date: Thu, 18 Jul 2024 22:35:42 +0300 Message-ID: <20240718193546.18939-1-remi@remlab.net> X-Mailer: git-send-email 2.45.2 MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 1/5] lavc/h264dsp: factor some mostly identical R-V V code X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" X-TUID: CSh3jjbDzxOP --- libavcodec/riscv/h264idct_rvv.S | 108 ++++++-------------------------- 1 file changed, 18 insertions(+), 90 deletions(-) diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S index 48de65ec0b..505f491308 100644 --- a/libavcodec/riscv/h264idct_rvv.S +++ b/libavcodec/riscv/h264idct_rvv.S @@ -418,8 +418,8 @@ const ff_h264_scan8 endconst #if (__riscv_xlen == 64) -.irp depth, 8, 16 -func ff_h264_idct_add16_\depth\()_rvv, zve32x +.macro idct4_adds type, depth +func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x csrwi vxrm, 0 addi sp, sp, -96 lla t0, ff_h264_scan8 @@ -455,9 +455,13 @@ func ff_h264_idct_add16_\depth\()_rvv, zve32x .endif vmsne.vi v1, v16, 0 vsetvli zero, zero, e8, m1, ta, ma +.ifc \type, 16 vmseq.vi v2, v12, 1 +.endif vmsne.vi v0, v12, 0 +.ifc \type, 16 vmand.mm v1, v1, v2 +.endif vsetvli zero, zero, e16, m2, ta, ma vmv.x.s s2, v0 vmv.x.s s3, v1 @@ -470,96 +474,9 @@ func ff_h264_idct_add16_\depth\()_rvv, zve32x andi t0, s2, 1 addi s1, s1, -1 srli s2, s2, 1 +.ifc \type, 16 beqz t0, 3f # if (nnz) - lw t2, (s5) # block_offset[i] - andi t1, s3, 1 - mv a1, s6 - mv a2, s7 - add a0, s4, t2 -.if \depth > 8 - mv a5, s8 .endif - bnez t1, 2f # if (nnz == 1 && block[i * 16]) - jal .Lidct_add4_\depth\()_rvv - j 3f -2: -.if \depth == 8 - call ff_h264_idct_dc_add_\depth\()_c -.else - jalr s9 -.endif -3: - srli s3, s3, 1 - addi s5, s5, 4 - addi s6, s6, 16 * 2 * (\depth / 8) - bnez s1, 1b - -.if \depth > 8 - ld s9, 80(sp) - ld s8, 72(sp) -.endif - ld s7, 64(sp) - ld s6, 56(sp) - ld s5, 48(sp) - ld s4, 40(sp) - ld s3, 32(sp) - ld s2, 24(sp) - ld s1, 16(sp) - ld ra, 8(sp) - ld s0, 0(sp) - addi sp, sp, 96 - ret -endfunc - -func ff_h264_idct_add16intra_\depth\()_rvv, zve32x - csrwi vxrm, 0 - addi sp, sp, -96 - lla t0, ff_h264_scan8 - sd s0, (sp) - li t1, 32 * (\depth / 8) - mv s0, sp - sd ra, 8(sp) - sd s1, 16(sp) - sd s2, 24(sp) - sd s3, 32(sp) - sd s4, 40(sp) - sd s5, 48(sp) - sd s6, 56(sp) - sd s7, 64(sp) -.if \depth > 8 - sd s8, 72(sp) - sd s9, 80(sp) - mv s8, a5 - mv s9, a6 -.endif - vsetivli zero, 16, e8, m1, ta, ma - vle8.v v8, (t0) -.if \depth == 8 - vlse16.v v16, (a2), t1 -.else - vlse32.v v16, (a2), t1 -.endif - vluxei8.v v12, (a4), v8 -.if \depth == 8 - vsetvli zero, zero, e16, m2, ta, ma -.else - vsetvli zero, zero, e32, m4, ta, ma -.endif - vmsne.vi v1, v16, 0 - vsetvli zero, zero, e8, m1, ta, ma - vmsne.vi v0, v12, 0 - vsetvli zero, zero, e16, m2, ta, ma - vmv.x.s s2, v0 - vmv.x.s s3, v1 - li s1, 16 - mv s4, a0 - mv s5, a1 - mv s6, a2 - mv s7, a3 -1: - andi t0, s2, 1 - addi s1, s1, -1 - srli s2, s2, 1 lw t2, (s5) # block_offset[i] andi t1, s3, 1 mv a1, s6 @@ -568,11 +485,17 @@ func ff_h264_idct_add16intra_\depth\()_rvv, zve32x .if \depth > 8 mv a5, s8 .endif +.ifc \type, 16 + bnez t1, 2f # if (nnz == 1 && block[i * 16]) +.else beqz t0, 2f # if (nnzc[scan8[i]]) +.endif jal .Lidct_add4_\depth\()_rvv j 3f 2: +.ifnc \type, 16 beqz t1, 3f # if (block[i * 16]) +.endif .if \depth == 8 call ff_h264_idct_dc_add_\depth\()_c .else @@ -600,6 +523,11 @@ func ff_h264_idct_add16intra_\depth\()_rvv, zve32x addi sp, sp, 96 ret endfunc +.endm + +.irp depth, 8, 16 +idct4_adds 16, \depth +idct4_adds 16intra, \depth func ff_h264_idct8_add4_\depth\()_rvv, zve32x csrwi vxrm, 0 From patchwork Thu Jul 18 19:35:43 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Patchwork-Submitter: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= X-Patchwork-Id: 50634 Delivered-To: ffmpegpatchwork2@gmail.com Received: by 2002:a59:a742:0:b0:482:c625:d099 with SMTP id f2csp198550vqm; Thu, 18 Jul 2024 12:36:18 -0700 (PDT) X-Forwarded-Encrypted: i=2; AJvYcCUzkKI6MBVmQebj95tcEZZhF4COFlsyOTXBQn4pCtEHm3AYO4gqW4T0Hz2QNT4KnghN2LHZaEpAv86rzSqdq4KHkOrxxNXc4K0Kvg== X-Google-Smtp-Source: AGHT+IEfVYN59oWrZP7t0/HGsgP6CRNZkpschNs6qdsUPzJHt9pQ3wgoEuDrTw6T+FAYfD38rR/i X-Received: by 2002:ac2:4c55:0:b0:52c:cd77:fe03 with SMTP id 2adb3069b0e04-52ee53b725emr5415024e87.14.1721331378555; Thu, 18 Jul 2024 12:36:18 -0700 (PDT) ARC-Seal: i=1; a=rsa-sha256; t=1721331378; cv=none; d=google.com; s=arc-20160816; b=V24MyGx5RtT36rQd/xREkMXCeDEUXZYkrXUmRncNtmgOvgwRtw0xL6iH8PRsV2BVZr RR76CjhhXtJla4MiF0js3Od4ltgLErp9ZfKugGbg3ZNHRTw7uTC4vy3Ec35VZq5Yv516 IsZI28Ao94Xg0gkMJL3LadzfG1qTHQfKSmX5ZyZSHyBR3DTnNmqkBdQUz9yBUDEYkNsq CeL+IAjv9WZmMC17nyc+I6JfwZEf4jjypNn4oEifzxWgssbQs3qvMeh13ZnHgEfNZkwt nASM/z9O8lDkf0vsAwFluBS8l7k2+YfW0oT/BWV94lKbDo9PkD9N6TiGAcCjZVEbtacb I+vw== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816; h=sender:errors-to:content-transfer-encoding:reply-to:list-subscribe :list-help:list-post:list-archive:list-unsubscribe:list-id :precedence:subject:mime-version:references:in-reply-to:message-id :date:to:from:delivered-to; bh=TzpJ6Es9LoDBPSiPhq/h4JjaWM8A6wgLpbLWd4BgeQE=; fh=YOA8vD9MJZuwZ71F/05pj6KdCjf6jQRmzLS+CATXUQk=; b=e6LhptWmckdKFYl9zTIVB1tH6tanboz8cxlFDSAEDBTC7wgY6vm2TsX3EUvTIxwY4J S864mBMZyzigMkNtYorhHZPJ29LspObZbM/1f7IOnFyJTGARDsRnWTIgqCCv1CW+pofO oVXcA1Z8t3DXnek2yIqhrgVet9bYY0ifgjLwT6ypT8kvEiv7LKwnIOpES79FrDXnX3+y 9CWp4lArU0wLjOWDVW1ktrWuWyWmRKUg3G1HC9IhPLPjZ5wTbWJp/xBtp8HwqL/cBehR 60+VMI+9mMMurtZGuNA7w13JhhCMQ+NVUVPRV7WaV6LPNk/Pnv315UKpZ2eIMXfQB0A1 55Sw==; dara=google.com ARC-Authentication-Results: i=1; mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id 2adb3069b0e04-52ef07d854esi359671e87.561.2024.07.18.12.36.18; Thu, 18 Jul 2024 12:36:18 -0700 (PDT) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 3983268D8A2; Thu, 18 Jul 2024 22:35:57 +0300 (EEST) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from ursule.remlab.net (vps-a2bccee9.vps.ovh.net [51.75.19.47]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 26C7B68D8EC for ; Thu, 18 Jul 2024 22:35:48 +0300 (EEST) Received: from basile.remlab.net (localhost [IPv6:::1]) by ursule.remlab.net (Postfix) with ESMTP id 50F92C016F for ; Thu, 18 Jul 2024 22:35:47 +0300 (EEST) From: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= To: ffmpeg-devel@ffmpeg.org Date: Thu, 18 Jul 2024 22:35:43 +0300 Message-ID: <20240718193546.18939-2-remi@remlab.net> X-Mailer: git-send-email 2.45.2 In-Reply-To: <20240718193546.18939-1-remi@remlab.net> References: <20240718193546.18939-1-remi@remlab.net> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 2/5] lavc/h264dsp: move R-V V idct_dc_add X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" X-TUID: bvZhXQ3eTC9Z From: "J. Dekker" No functional changes. This just moves the assembler so that it can be referenced by other functions in h264idct_rvv.S with local jumps. Edited-by: Rémi Denis-Courmont --- libavcodec/riscv/h264dsp_rvv.S | 103 ------------------------------- libavcodec/riscv/h264idct_rvv.S | 105 ++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 103 deletions(-) diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S index 5c70709cf2..ed6a16a9c4 100644 --- a/libavcodec/riscv/h264dsp_rvv.S +++ b/libavcodec/riscv/h264dsp_rvv.S @@ -1,7 +1,6 @@ /* * SPDX-License-Identifier: BSD-2-Clause * - * Copyright (c) 2024 J. Dekker * Copyright © 2024 Rémi Denis-Courmont. * * Redistribution and use in source and binary forms, with or without @@ -326,105 +325,3 @@ func ff_h264_h_loop_filter_luma_mbaff_8_rvv, zve32x vssseg6e8.v v8, (a0), a1 ret endfunc - -.macro idct_dc_add8 width -func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba -.if \width == 8 - vsetivli zero, \width, e16, m1, ta, ma -.else - vsetivli zero, \width, e16, mf2, ta, ma -.endif - lh a3, 0(a1) - addi a3, a3, 32 - srai a3, a3, 6 - sh zero, 0(a1) -.if \width == 8 - vlse64.v v24, (a0), a2 - vsetvli t0, zero, e16, m8, ta, ma -.else - vlse32.v v24, (a0), a2 - vsetvli t0, zero, e16, m4, ta, ma -.endif - vzext.vf2 v0, v24 - vadd.vx v0, v0, a3 - vmax.vx v0, v0, zero -.if \width == 8 - vsetvli zero, zero, e8, m4, ta, ma -.else - vsetvli zero, zero, e8, m2, ta, ma -.endif - vnclipu.wi v24, v0, 0 - vsetivli zero, \width, e8, m1, ta, ma -.if \width == 8 - vsse64.v v24, (a0), a2 -.else - vsse32.v v24, (a0), a2 -.endif - ret -endfunc -.endm - -idct_dc_add8 4 -idct_dc_add8 8 - -.macro idct_dc_add width -func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba - vsetivli zero, \width, e16, m1, ta, ma - lw a3, 0(a1) - addi a3, a3, 32 - srai a3, a3, 6 - sw zero, 0(a1) - add t4, a0, a2 - sh1add t5, a2, a0 - sh1add t6, a2, t4 -.if \width == 8 - sh2add t0, a2, a0 - sh2add t1, a2, t4 - sh2add t2, a2, t5 - sh2add t3, a2, t6 -.endif - vle16.v v0, (a0) - vle16.v v1, (t4) - vle16.v v2, (t5) - vle16.v v3, (t6) -.if \width == 8 - vle16.v v4, (t0) - vle16.v v5, (t1) - vle16.v v6, (t2) - vle16.v v7, (t3) - vsetvli a6, zero, e16, m8, ta, ma -.else - vsetvli a6, zero, e16, m4, ta, ma -.endif - vadd.vx v0, v0, a3 - vmax.vx v0, v0, zero - vmin.vx v0, v0, a5 - vsetivli zero, \width, e16, m1, ta, ma - vse16.v v0, (a0) - vse16.v v1, (t4) - vse16.v v2, (t5) - vse16.v v3, (t6) -.if \width == 8 - vse16.v v4, (t0) - vse16.v v5, (t1) - vse16.v v6, (t2) - vse16.v v7, (t3) -.endif - ret -endfunc -.endm - -idct_dc_add 4 -idct_dc_add 8 - -.irp depth,9,10,12,14 -func ff_h264_idct4_dc_add_\depth\()_rvv, zve64x - li a5, (1 << \depth) - 1 - j ff_h264_idct4_dc_add_16_rvv -endfunc - -func ff_h264_idct8_dc_add_\depth\()_rvv, zve64x - li a5, (1 << \depth) - 1 - j ff_h264_idct8_dc_add_16_rvv -endfunc -.endr diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S index 505f491308..37b27fc92a 100644 --- a/libavcodec/riscv/h264idct_rvv.S +++ b/libavcodec/riscv/h264idct_rvv.S @@ -1,4 +1,7 @@ /* + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 J. Dekker * Copyright © 2024 Rémi Denis-Courmont. * * Redistribution and use in source and binary forms, with or without @@ -412,6 +415,108 @@ func ff_h264_idct8_add_\depth\()_rvv, zve32x endfunc .endr +.macro idct_dc_add8 width +func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba +.if \width == 8 + vsetivli zero, \width, e16, m1, ta, ma +.else + vsetivli zero, \width, e16, mf2, ta, ma +.endif + lh a3, 0(a1) + addi a3, a3, 32 + srai a3, a3, 6 + sh zero, 0(a1) +.if \width == 8 + vlse64.v v24, (a0), a2 + vsetvli t0, zero, e16, m8, ta, ma +.else + vlse32.v v24, (a0), a2 + vsetvli t0, zero, e16, m4, ta, ma +.endif + vzext.vf2 v0, v24 + vadd.vx v0, v0, a3 + vmax.vx v0, v0, zero +.if \width == 8 + vsetvli zero, zero, e8, m4, ta, ma +.else + vsetvli zero, zero, e8, m2, ta, ma +.endif + vnclipu.wi v24, v0, 0 + vsetivli zero, \width, e8, m1, ta, ma +.if \width == 8 + vsse64.v v24, (a0), a2 +.else + vsse32.v v24, (a0), a2 +.endif + ret +endfunc +.endm + +idct_dc_add8 4 +idct_dc_add8 8 + +.macro idct_dc_add width +func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba + vsetivli zero, \width, e16, m1, ta, ma + lw a3, 0(a1) + addi a3, a3, 32 + srai a3, a3, 6 + sw zero, 0(a1) + add t4, a0, a2 + sh1add t5, a2, a0 + sh1add t6, a2, t4 +.if \width == 8 + sh2add t0, a2, a0 + sh2add t1, a2, t4 + sh2add t2, a2, t5 + sh2add t3, a2, t6 +.endif + vle16.v v0, (a0) + vle16.v v1, (t4) + vle16.v v2, (t5) + vle16.v v3, (t6) +.if \width == 8 + vle16.v v4, (t0) + vle16.v v5, (t1) + vle16.v v6, (t2) + vle16.v v7, (t3) + vsetvli a6, zero, e16, m8, ta, ma +.else + vsetvli a6, zero, e16, m4, ta, ma +.endif + vadd.vx v0, v0, a3 + vmax.vx v0, v0, zero + vmin.vx v0, v0, a5 + vsetivli zero, \width, e16, m1, ta, ma + vse16.v v0, (a0) + vse16.v v1, (t4) + vse16.v v2, (t5) + vse16.v v3, (t6) +.if \width == 8 + vse16.v v4, (t0) + vse16.v v5, (t1) + vse16.v v6, (t2) + vse16.v v7, (t3) +.endif + ret +endfunc +.endm + +idct_dc_add 4 +idct_dc_add 8 + +.irp depth,9,10,12,14 +func ff_h264_idct4_dc_add_\depth\()_rvv, zve64x + li a5, (1 << \depth) - 1 + j ff_h264_idct4_dc_add_16_rvv +endfunc + +func ff_h264_idct8_dc_add_\depth\()_rvv, zve64x + li a5, (1 << \depth) - 1 + j ff_h264_idct8_dc_add_16_rvv +endfunc +.endr + const ff_h264_scan8 .byte 014, 015, 024, 025, 016, 017, 026, 027 .byte 034, 035, 044, 045, 036, 037, 046, 047 From patchwork Thu Jul 18 19:35:44 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= X-Patchwork-Id: 50632 Delivered-To: ffmpegpatchwork2@gmail.com Received: by 2002:a59:a742:0:b0:482:c625:d099 with SMTP id f2csp198351vqm; Thu, 18 Jul 2024 12:35:59 -0700 (PDT) X-Forwarded-Encrypted: i=2; AJvYcCW80X9jYj6tjkNy7K3xP6AXOpaQFJCNkfI1QuMoCrDFH0OcXMuWijSFoR9344SsW8NpICEzBQYF+3NSjtM7Wjlob9Gls6623cEmDw== X-Google-Smtp-Source: AGHT+IEf9fpmwp/9LPfOFi1EO0cAbN3VnHFa4aehhKP8uBSO55Gutr3PyKT0lEZ3i5u0YbiKGRll X-Received: by 2002:a05:6512:2252:b0:52c:e03d:fa33 with SMTP id 2adb3069b0e04-52ee544caddmr4374550e87.62.1721331359012; Thu, 18 Jul 2024 12:35:59 -0700 (PDT) ARC-Seal: i=1; a=rsa-sha256; t=1721331358; cv=none; d=google.com; s=arc-20160816; b=cui+fjSY8gpkrUmZXFT2RREDIvAh5lfe34UAiBP2VTLCNSiApT7uAUodQB2jfanuIB tJWHwg0xI/5zNGTCn28Zcvdvf0IWeNn6tGOt3oERGuO+WwECGdBWdFRGsksVTI0BYInX tUmZq/9JxshCso49A8EuSZ7z1JTBInts4a5s3ZjLbAMyBEWSPYg6pM6ipu3CTyamuovN PnksoPPIPQxXowyiNj1kMs8oBFPJ8Vh+2+s6GSRDA6fDqRQ4xKR/FiSTwS8/oMvsoZnp 9lgxg79em2V6MzrlcGIlcuUB4bspTjJ3jsdqYT6S7U6K+ApG4+wYZ0Lw6rUpPsm02XlN b+9A== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816; h=sender:errors-to:content-transfer-encoding:reply-to:list-subscribe :list-help:list-post:list-archive:list-unsubscribe:list-id :precedence:subject:mime-version:references:in-reply-to:message-id :date:to:from:delivered-to; bh=oJZmChKOHPkYpy0IcOiAtbu8SFwWY2VHeA5qr1Sz6zE=; fh=YOA8vD9MJZuwZ71F/05pj6KdCjf6jQRmzLS+CATXUQk=; b=h/vzYahhVfR3V5L/U1x2+6K/Hop4wpZRerAEhF9agrHESeZDXQQwTtpRtPUYE8dixN PV4ceKJxbTd/q/L1lMESlqOlFhkmDYT9OoBkc9cJU+XVOiN+2o+5nTOBHuQnaOKJXobe MF8wdU2qxpv1rXFyusB5G4OvJFAFjn+Or7J3gjQqvKLKI9GkoqwGdXH4uaMj49aOnYcT rD8B4iKTp2oWVrFPt/LcSZhXYpCPwzANVg9V4BgYlpUK+uNl74aRSNWas7oNeJ4UTDVZ x0macjRW+ufh06wzH8G0fjwv1tJupxnkBzB1evQ70GdgPpMq4YfU+K9AM8M7OE5IgmGa 1a+Q==; dara=google.com ARC-Authentication-Results: i=1; mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id 4fb4d7f45d1cf-5a2d052863bsi93628a12.101.2024.07.18.12.35.58; Thu, 18 Jul 2024 12:35:58 -0700 (PDT) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id E0EAB68DAF2; Thu, 18 Jul 2024 22:35:54 +0300 (EEST) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from ursule.remlab.net (vps-a2bccee9.vps.ovh.net [51.75.19.47]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 23B9A68D85C for ; Thu, 18 Jul 2024 22:35:48 +0300 (EEST) Received: from basile.remlab.net (localhost [IPv6:::1]) by ursule.remlab.net (Postfix) with ESMTP id 81B35C01F0 for ; Thu, 18 Jul 2024 22:35:47 +0300 (EEST) From: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= To: ffmpeg-devel@ffmpeg.org Date: Thu, 18 Jul 2024 22:35:44 +0300 Message-ID: <20240718193546.18939-3-remi@remlab.net> X-Mailer: git-send-email 2.45.2 In-Reply-To: <20240718193546.18939-1-remi@remlab.net> References: <20240718193546.18939-1-remi@remlab.net> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 3/5] lavc/h264dsp: correct VL and LMUL in idct_dc_add X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" X-TUID: GcJWPTgG897Q T-Head C908 (cycles): h264_idct4_dc_add_8bpp_c: 94.7 h264_idct4_dc_add_8bpp_rvv_i32: 55.0 (before) h264_idct4_dc_add_8bpp_rvv_i32: 34.5 (after) h264_idct4_dc_add_9bpp_c: 94.7 h264_idct4_dc_add_9bpp_rvv_i32: 43.5 (before) h264_idct4_dc_add_9bpp_rvv_i32: 38.2 (after) h264_idct4_dc_add_10bpp_c: 94.7 h264_idct4_dc_add_10bpp_rvv_i32: 43.5 (before) h264_idct4_dc_add_10bpp_rvv_i32: 38.2 (after) h264_idct4_dc_add_12bpp_c: 94.7 h264_idct4_dc_add_12bpp_rvv_i32: 43.7 (before) h264_idct4_dc_add_12bpp_rvv_i32: 38.5 (after) h264_idct4_dc_add_14bpp_c: 94.7 h264_idct4_dc_add_14bpp_rvv_i32: 43.7 (before) h264_idct4_dc_add_14bpp_rvv_i32: 38.5 (after) --- libavcodec/riscv/h264idct_rvv.S | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S index 37b27fc92a..2648e06aeb 100644 --- a/libavcodec/riscv/h264idct_rvv.S +++ b/libavcodec/riscv/h264idct_rvv.S @@ -416,22 +416,23 @@ endfunc .endr .macro idct_dc_add8 width -func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba +func ff_h264_idct\width\()_dc_add_8_rvv, zve64x .if \width == 8 - vsetivli zero, \width, e16, m1, ta, ma + vsetivli zero, \width, e8, mf2, ta, ma .else - vsetivli zero, \width, e16, mf2, ta, ma + vsetivli zero, \width, e8, mf4, ta, ma .endif lh a3, 0(a1) addi a3, a3, 32 srai a3, a3, 6 sh zero, 0(a1) .if \width == 8 + li a6, \width * \width vlse64.v v24, (a0), a2 - vsetvli t0, zero, e16, m8, ta, ma + vsetvli zero, a6, e16, m8, ta, ma .else vlse32.v v24, (a0), a2 - vsetvli t0, zero, e16, m4, ta, ma + vsetivli zero, \width * \width, e16, m2, ta, ma .endif vzext.vf2 v0, v24 vadd.vx v0, v0, a3 @@ -439,13 +440,14 @@ func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba .if \width == 8 vsetvli zero, zero, e8, m4, ta, ma .else - vsetvli zero, zero, e8, m2, ta, ma + vsetvli zero, zero, e8, m1, ta, ma .endif vnclipu.wi v24, v0, 0 - vsetivli zero, \width, e8, m1, ta, ma .if \width == 8 + vsetivli zero, \width, e8, mf2, ta, ma vsse64.v v24, (a0), a2 .else + vsetivli zero, \width, e8, mf4, ta, ma vsse32.v v24, (a0), a2 .endif ret @@ -457,7 +459,11 @@ idct_dc_add8 8 .macro idct_dc_add width func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba +.if \width == 8 vsetivli zero, \width, e16, m1, ta, ma +.else + vsetivli zero, \width, e16, mf2, ta, ma +.endif lw a3, 0(a1) addi a3, a3, 32 srai a3, a3, 6 @@ -487,7 +493,11 @@ func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba vadd.vx v0, v0, a3 vmax.vx v0, v0, zero vmin.vx v0, v0, a5 +.if \width == 8 vsetivli zero, \width, e16, m1, ta, ma +.else + vsetivli zero, \width, e16, mf2, ta, ma +.endif vse16.v v0, (a0) vse16.v v1, (t4) vse16.v v2, (t5) From patchwork Thu Jul 18 19:35:45 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= X-Patchwork-Id: 50635 Delivered-To: ffmpegpatchwork2@gmail.com Received: by 2002:a59:a742:0:b0:482:c625:d099 with SMTP id f2csp198614vqm; Thu, 18 Jul 2024 12:36:27 -0700 (PDT) X-Forwarded-Encrypted: i=2; AJvYcCVH5mXmJ2nbbmD+QjxDMaOa/RNYx3C0y5E7oTF8boF1RPQleMwgWL6SG8ovgniIU5uL5DOTcfh3ZQwsASvgZY5sjbBc7EMHoBRJbA== X-Google-Smtp-Source: AGHT+IENsm35/H+PUiMD+pAQcPjLqMsZn1cX/rrsWwkQ2Mym6Q34//4H8yxk3fLGl3D3KeEPxhPu X-Received: by 2002:a17:906:b74d:b0:a72:8c15:c73e with SMTP id a640c23a62f3a-a7a0130e696mr417172466b.55.1721331386745; Thu, 18 Jul 2024 12:36:26 -0700 (PDT) ARC-Seal: i=1; a=rsa-sha256; t=1721331386; cv=none; d=google.com; s=arc-20160816; b=BuerUssW3CyM/SKxtJ0tJ5OgQExNCosmUWYitc588eWywdyYfyONgQ8nVvMdkDWHX+ WdyvlCEwmebVaBd8D3Bg7O8oRERFGgxRB9susILpQeNvIiY5Lfd7HbQBHdebHKEjoZgF feQlntFucPmkSi3JcKID9rn/Xo4WQFd3VHmV28aunT3PS2ZX6QlcZBRoYbiuSo2A2iWk 1yNMBiAMtUB46wnfFq4R53mqFPt7DLht+pkZxcKYw09DBf/ZuidzHzb5XmPnLV2RItnm POXAkfH4EvYi14NmtQhVaZw6IsGtVMbWMNlPeQXBihuZ1P5k39OppC+Gfs1PI0091n3h /nxg== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816; h=sender:errors-to:content-transfer-encoding:reply-to:list-subscribe :list-help:list-post:list-archive:list-unsubscribe:list-id :precedence:subject:mime-version:references:in-reply-to:message-id :date:to:from:delivered-to; bh=XY3/wxk7MMgN6yWa+TG0tY3+2VxqCEnN4vBWBVgkum4=; fh=YOA8vD9MJZuwZ71F/05pj6KdCjf6jQRmzLS+CATXUQk=; b=bpLSMgvd5z+RRatYWJqspeiUzPzlshxmvAU4gY6tmekHxrnzm/jcyFOt00+EFwoMxi D7cJ3ZyhIQedEDlEkBPq77QSHY2ayPY3CnHYrEdIUDPtwrGrcuYOdiiUKF9S0uK6VMWh 3o7Tzn6vuRrmzbQS61hQER03UYhpMaMXfQFatbbQjRLFFNk8+dNmxEDtkKMAAL3/mPdq LdOjA2xiiDx59vVwlzPDUFipxpd1K+LY95ztXe+5Xre7soazM3h1DCCJtAxAUOUuFS2W esS7p76SlPKORRmdkAa2IVJP84k+R3Iq6Dfpb9bgLVfwwgZAw2wsLUNqEVB/wwA2phqB ddog==; dara=google.com ARC-Authentication-Results: i=1; mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id a640c23a62f3a-a7a3077914csi8037666b.302.2024.07.18.12.36.26; Thu, 18 Jul 2024 12:36:26 -0700 (PDT) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 7ADC168DA86; Thu, 18 Jul 2024 22:35:58 +0300 (EEST) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from ursule.remlab.net (vps-a2bccee9.vps.ovh.net [51.75.19.47]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 310A568DA1E for ; Thu, 18 Jul 2024 22:35:48 +0300 (EEST) Received: from basile.remlab.net (localhost [IPv6:::1]) by ursule.remlab.net (Postfix) with ESMTP id B2ADCC0233 for ; Thu, 18 Jul 2024 22:35:47 +0300 (EEST) From: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= To: ffmpeg-devel@ffmpeg.org Date: Thu, 18 Jul 2024 22:35:45 +0300 Message-ID: <20240718193546.18939-4-remi@remlab.net> X-Mailer: git-send-email 2.45.2 In-Reply-To: <20240718193546.18939-1-remi@remlab.net> References: <20240718193546.18939-1-remi@remlab.net> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 4/5] lavc/h264dsp: reuse the R-V V IDCT DC add functions X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" X-TUID: 4hAZPK//xUDI This reuses the DC bypass functions from the multiple IDCT functions, to leverage vector code. As an added bonus, the caller functions can now rely on the callee functions to preserve their parameters, thus cutting down on stack spills. --- libavcodec/riscv/h264idct_rvv.S | 76 +++++++-------------------------- 1 file changed, 16 insertions(+), 60 deletions(-) diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S index 2648e06aeb..c42db6ef29 100644 --- a/libavcodec/riscv/h264idct_rvv.S +++ b/libavcodec/riscv/h264idct_rvv.S @@ -536,7 +536,7 @@ endconst .macro idct4_adds type, depth func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x csrwi vxrm, 0 - addi sp, sp, -96 + addi sp, sp, -64 lla t0, ff_h264_scan8 sd s0, (sp) li t1, 32 * (\depth / 8) @@ -547,14 +547,6 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x sd s3, 32(sp) sd s4, 40(sp) sd s5, 48(sp) - sd s6, 56(sp) - sd s7, 64(sp) -.if \depth > 8 - sd s8, 72(sp) - sd s9, 80(sp) - mv s8, a5 - mv s9, a6 -.endif vsetivli zero, 16, e8, m1, ta, ma vle8.v v8, (t0) .if \depth == 8 @@ -583,8 +575,8 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x li s1, 16 mv s4, a0 mv s5, a1 - mv s6, a2 - mv s7, a3 + mv a1, a2 + mv a2, a3 1: andi t0, s2, 1 addi s1, s1, -1 @@ -594,12 +586,7 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x .endif lw t2, (s5) # block_offset[i] andi t1, s3, 1 - mv a1, s6 - mv a2, s7 add a0, s4, t2 -.if \depth > 8 - mv a5, s8 -.endif .ifc \type, 16 bnez t1, 2f # if (nnz == 1 && block[i * 16]) .else @@ -611,23 +598,13 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x .ifnc \type, 16 beqz t1, 3f # if (block[i * 16]) .endif -.if \depth == 8 - call ff_h264_idct_dc_add_\depth\()_c -.else - jalr s9 -.endif + jal ff_h264_idct4_dc_add_\depth\()_rvv 3: srli s3, s3, 1 addi s5, s5, 4 - addi s6, s6, 16 * 2 * (\depth / 8) + addi a1, a1, 16 * 2 * (\depth / 8) bnez s1, 1b -.if \depth > 8 - ld s9, 80(sp) - ld s8, 72(sp) -.endif - ld s7, 64(sp) - ld s6, 56(sp) ld s5, 48(sp) ld s4, 40(sp) ld s3, 32(sp) @@ -635,7 +612,7 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x ld s1, 16(sp) ld ra, 8(sp) ld s0, 0(sp) - addi sp, sp, 96 + addi sp, sp, 64 ret endfunc .endm @@ -646,7 +623,7 @@ idct4_adds 16intra, \depth func ff_h264_idct8_add4_\depth\()_rvv, zve32x csrwi vxrm, 0 - addi sp, sp, -96 + addi sp, sp, -64 lla t0, ff_h264_scan8 sd s0, (sp) li t1, 4 * 32 * (\depth / 8) @@ -658,14 +635,6 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x sd s3, 32(sp) sd s4, 40(sp) sd s5, 48(sp) - sd s6, 56(sp) - sd s7, 64(sp) -.if \depth > 8 - sd s8, 72(sp) - sd s9, 80(sp) - mv s8, a5 - mv s9, a6 -.endif vsetivli zero, 4, e8, mf4, ta, ma vlse8.v v8, (t0), t2 .if \depth == 8 @@ -689,8 +658,8 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x li s1, 4 mv s4, a0 mv s5, a1 - mv s6, a2 - mv s7, a3 + mv a1, a2 + mv a2, a3 1: andi t0, s2, 1 addi s1, s1, -1 @@ -698,33 +667,23 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x beqz t0, 3f # if (nnz) lw t2, (s5) # block_offset[i] andi t1, s3, 1 - mv a1, s6 - mv a2, s7 add a0, s4, t2 -.if \depth > 8 - mv a5, s8 -.endif bnez t1, 2f # if (nnz == 1 && block[i * 16]) jal .Lidct8_add_\depth\()_rvv - j 3f -2: .if \depth == 8 - call ff_h264_idct8_dc_add_\depth\()_c + j 3f .else - jalr s9 + j 4f # idct8_add_16 updates a1 .endif +2: + jal ff_h264_idct8_dc_add_\depth\()_rvv 3: + addi a1, a1, 4 * 16 * 2 * (\depth / 8) +4: srli s3, s3, 1 addi s5, s5, 4 * 4 - addi s6, s6, 4 * 16 * 2 * (\depth / 8) bnez s1, 1b -.if \depth > 8 - ld s9, 80(sp) - ld s8, 72(sp) -.endif - ld s7, 64(sp) - ld s6, 56(sp) ld s5, 48(sp) ld s4, 40(sp) ld s3, 32(sp) @@ -732,7 +691,7 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x ld s1, 16(sp) ld ra, 8(sp) ld s0, 0(sp) - addi sp, sp, 96 + addi sp, sp, 64 ret endfunc .endr @@ -740,19 +699,16 @@ endfunc .irp depth, 9, 10, 12, 14 func ff_h264_idct_add16_\depth\()_rvv, zve32x li a5, (1 << \depth) - 1 - lla a6, ff_h264_idct_dc_add_\depth\()_c j ff_h264_idct_add16_16_rvv endfunc func ff_h264_idct_add16intra_\depth\()_rvv, zve32x li a5, (1 << \depth) - 1 - lla a6, ff_h264_idct_dc_add_\depth\()_c j ff_h264_idct_add16intra_16_rvv endfunc func ff_h264_idct8_add4_\depth\()_rvv, zve32x li a5, (1 << \depth) - 1 - lla a6, ff_h264_idct8_dc_add_\depth\()_c j ff_h264_idct8_add4_16_rvv endfunc .endr From patchwork Thu Jul 18 19:35:46 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= X-Patchwork-Id: 50636 Delivered-To: ffmpegpatchwork2@gmail.com Received: by 2002:a59:a742:0:b0:482:c625:d099 with SMTP id f2csp203206vqm; Thu, 18 Jul 2024 12:46:51 -0700 (PDT) X-Forwarded-Encrypted: i=2; AJvYcCWUWsWqCZaEgdoYVibGtzRZIJN4Jpl532eU6xu+D89E0xC+ylOHI7Kiyu4e3o6zELfAR9vZstxXvm3qwZtf0JOwiR0+xXr0F7hqaw== X-Google-Smtp-Source: AGHT+IFhu9jW+q9C9gXkjM6fTbZd6HnxG8wYTvbxuv88sbmSgu3JsU6ar+RshR6SD8RZCHkMInYl X-Received: by 2002:a05:6512:690:b0:52c:e086:7953 with SMTP id 2adb3069b0e04-52ee539c939mr4773963e87.4.1721332011310; Thu, 18 Jul 2024 12:46:51 -0700 (PDT) ARC-Seal: i=1; a=rsa-sha256; t=1721332011; cv=none; d=google.com; s=arc-20160816; b=QoVZfx9PzbbHe3T6vB7GQ7hC+OjqNE1gc9QqYReJN+LwNNbun8jkgU5qi7eikHtLCd y9jgnJqujAzEhxQxp0HBkrJxcOthsC86SEvJVyY6QWWWwoO7btAJr+n3SObxI4gk2SIF M0ixlFv3SHHA7DczNAu/xjDKwgoJruZWuhgUsciqQGhsPspMGuMXvgQY523tuf4ZF67X UTw6gO3q0rpprhKEZhbpRTnRqNG2HFbdBrF/uSGivkP4CqYOWf7qaJ/k+CdJPKo3iXLY rSbpLpVTY08sCHTMuPg/5hhNDItx3dRBTn2UVvm6OvLeNaAWkprkjGTQz89pzwK7vMKr wNbA== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816; h=sender:errors-to:content-transfer-encoding:reply-to:list-subscribe :list-help:list-post:list-archive:list-unsubscribe:list-id :precedence:subject:mime-version:references:in-reply-to:message-id :date:to:from:delivered-to; bh=DD5ObQ9CzfVEWniFB8/kh/E1u0RI7alp8+vzh6lx5V4=; fh=YOA8vD9MJZuwZ71F/05pj6KdCjf6jQRmzLS+CATXUQk=; b=j4KInz+6WYGB9dNqvk2HrvMab/NenK7Oce1pWttXBsAQOdlmM7ZDuwJePoVxMw8333 ca57nSsACOdm3WuF0mABUmsMSDgiWfnzPZmvpZ6A84CDNz0WpWA1foEa7RIQ/lbJIhO1 DLoOuoQMmADV2Mbow1f6rXjxaXXay9rLdwdycdwHppI0v3Gl7hrbsDyfJaxHDJpZSRkg udj5kBhOcdglIyyIxQG8864smHqt3fm3irpUE5xxkaT3S2TxCsUKXwv9FhdLFkbigtjU 6evl/Idu7UYnfzDYkY9tcEtFrNrUS+xNWrieQ46nxqXVz/0w3XNQ8n4oTlF3SjQiWTGi QtXQ==; dara=google.com ARC-Authentication-Results: i=1; mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id 2adb3069b0e04-52ef07d8df3si364303e87.567.2024.07.18.12.46.50; Thu, 18 Jul 2024 12:46:51 -0700 (PDT) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 04FFC68DB21; Thu, 18 Jul 2024 22:36:00 +0300 (EEST) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from ursule.remlab.net (vps-a2bccee9.vps.ovh.net [51.75.19.47]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 76A0F68D6BC for ; Thu, 18 Jul 2024 22:35:48 +0300 (EEST) Received: from basile.remlab.net (localhost [IPv6:::1]) by ursule.remlab.net (Postfix) with ESMTP id E404DC0236 for ; Thu, 18 Jul 2024 22:35:47 +0300 (EEST) From: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= To: ffmpeg-devel@ffmpeg.org Date: Thu, 18 Jul 2024 22:35:46 +0300 Message-ID: <20240718193546.18939-5-remi@remlab.net> X-Mailer: git-send-email 2.45.2 In-Reply-To: <20240718193546.18939-1-remi@remlab.net> References: <20240718193546.18939-1-remi@remlab.net> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 5/5] lavc/h264dsp: reduce spills in R-V V idct_add16 X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" X-TUID: +/18JgxnbWVq --- libavcodec/riscv/h264idct_rvv.S | 100 ++++++++++++++------------------ 1 file changed, 45 insertions(+), 55 deletions(-) diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S index c42db6ef29..c74ea18c19 100644 --- a/libavcodec/riscv/h264idct_rvv.S +++ b/libavcodec/riscv/h264idct_rvv.S @@ -422,9 +422,9 @@ func ff_h264_idct\width\()_dc_add_8_rvv, zve64x .else vsetivli zero, \width, e8, mf4, ta, ma .endif - lh a3, 0(a1) - addi a3, a3, 32 - srai a3, a3, 6 + lh t0, 0(a1) + addi t0, t0, 32 + srai t0, t0, 6 sh zero, 0(a1) .if \width == 8 li a6, \width * \width @@ -435,7 +435,7 @@ func ff_h264_idct\width\()_dc_add_8_rvv, zve64x vsetivli zero, \width * \width, e16, m2, ta, ma .endif vzext.vf2 v0, v24 - vadd.vx v0, v0, a3 + vadd.vx v0, v0, t0 vmax.vx v0, v0, zero .if \width == 8 vsetvli zero, zero, e8, m4, ta, ma @@ -464,33 +464,33 @@ func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba .else vsetivli zero, \width, e16, mf2, ta, ma .endif - lw a3, 0(a1) - addi a3, a3, 32 - srai a3, a3, 6 + lw t0, 0(a1) + addi t0, t0, 32 + srai t0, t0, 6 sw zero, 0(a1) - add t4, a0, a2 - sh1add t5, a2, a0 - sh1add t6, a2, t4 + add t1, a0, a2 + sh1add t2, a2, a0 + sh1add t3, a2, t1 .if \width == 8 - sh2add t0, a2, a0 - sh2add t1, a2, t4 - sh2add t2, a2, t5 - sh2add t3, a2, t6 + sh2add t4, a2, a0 + sh2add t5, a2, t1 + sh2add t6, a2, t2 + sh2add a7, a2, t3 .endif vle16.v v0, (a0) - vle16.v v1, (t4) - vle16.v v2, (t5) - vle16.v v3, (t6) + vle16.v v1, (t1) + vle16.v v2, (t2) + vle16.v v3, (t3) .if \width == 8 - vle16.v v4, (t0) - vle16.v v5, (t1) - vle16.v v6, (t2) - vle16.v v7, (t3) + vle16.v v4, (t4) + vle16.v v5, (t5) + vle16.v v6, (t6) + vle16.v v7, (a7) vsetvli a6, zero, e16, m8, ta, ma .else vsetvli a6, zero, e16, m4, ta, ma .endif - vadd.vx v0, v0, a3 + vadd.vx v0, v0, t0 vmax.vx v0, v0, zero vmin.vx v0, v0, a5 .if \width == 8 @@ -499,14 +499,14 @@ func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba vsetivli zero, \width, e16, mf2, ta, ma .endif vse16.v v0, (a0) - vse16.v v1, (t4) - vse16.v v2, (t5) - vse16.v v3, (t6) + vse16.v v1, (t1) + vse16.v v2, (t2) + vse16.v v3, (t3) .if \width == 8 - vse16.v v4, (t0) - vse16.v v5, (t1) - vse16.v v6, (t2) - vse16.v v7, (t3) + vse16.v v4, (t4) + vse16.v v5, (t5) + vse16.v v6, (t6) + vse16.v v7, (a7) .endif ret endfunc @@ -536,17 +536,12 @@ endconst .macro idct4_adds type, depth func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x csrwi vxrm, 0 - addi sp, sp, -64 + addi sp, sp, -16 lla t0, ff_h264_scan8 sd s0, (sp) li t1, 32 * (\depth / 8) mv s0, sp sd ra, 8(sp) - sd s1, 16(sp) - sd s2, 24(sp) - sd s3, 32(sp) - sd s4, 40(sp) - sd s5, 48(sp) vsetivli zero, 16, e8, m1, ta, ma vle8.v v8, (t0) .if \depth == 8 @@ -570,23 +565,23 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x vmand.mm v1, v1, v2 .endif vsetvli zero, zero, e16, m2, ta, ma - vmv.x.s s2, v0 - vmv.x.s s3, v1 - li s1, 16 - mv s4, a0 - mv s5, a1 + vmv.x.s a4, v0 + vmv.x.s a7, v1 + mv t4, a0 + mv t5, a1 mv a1, a2 mv a2, a3 + li a3, 16 1: - andi t0, s2, 1 - addi s1, s1, -1 - srli s2, s2, 1 + andi t0, a4, 1 + addi a3, a3, -1 + srli a4, a4, 1 .ifc \type, 16 beqz t0, 3f # if (nnz) .endif - lw t2, (s5) # block_offset[i] - andi t1, s3, 1 - add a0, s4, t2 + lw t2, (t5) # block_offset[i] + andi t1, a7, 1 + add a0, t4, t2 .ifc \type, 16 bnez t1, 2f # if (nnz == 1 && block[i * 16]) .else @@ -600,19 +595,14 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x .endif jal ff_h264_idct4_dc_add_\depth\()_rvv 3: - srli s3, s3, 1 - addi s5, s5, 4 + srli a7, a7, 1 + addi t5, t5, 4 addi a1, a1, 16 * 2 * (\depth / 8) - bnez s1, 1b + bnez a3, 1b - ld s5, 48(sp) - ld s4, 40(sp) - ld s3, 32(sp) - ld s2, 24(sp) - ld s1, 16(sp) ld ra, 8(sp) ld s0, 0(sp) - addi sp, sp, 64 + addi sp, sp, 16 ret endfunc .endm