
[FFmpeg-devel,2/5] lavc/h264dsp: move R-V V idct_dc_add

Message ID 20240718193546.18939-2-remi@remlab.net
State New
Series [FFmpeg-devel,1/5] lavc/h264dsp: factor some mostly identical R-V V code

Commit Message

Rémi Denis-Courmont July 18, 2024, 7:35 p.m. UTC
From: "J. Dekker" <jdek@itanimul.li>

No functional changes. This just moves the assembly so that it can be
referenced by other functions in h264idct_rvv.S with local jumps.

Edited-by: Rémi Denis-Courmont <remi@remlab.net>
---
 libavcodec/riscv/h264dsp_rvv.S  | 103 -------------------------------
 libavcodec/riscv/h264idct_rvv.S | 105 ++++++++++++++++++++++++++++++++
 2 files changed, 105 insertions(+), 103 deletions(-)
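
For context, the functions being moved implement the DC-only IDCT
shortcut of H.264: when the only non-zero coefficient of a 4x4 or 8x8
block is the DC term, the inverse transform reduces to adding one
rounded value to every pixel. A minimal scalar sketch of what the
8-bit variant computes (illustrative only, with hypothetical helper
names; the stride is in bytes, as in the assembly):

#include <stddef.h>
#include <stdint.h>

static uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : (v > 255 ? 255 : v);
}

/* Scalar equivalent of ff_h264_idct{4,8}_dc_add_8_rvv: add the
 * rounded DC term to a width x width block of pixels and clear the
 * coefficient, matching the asm's lh/addi/srai/sh prologue and the
 * vmax + vnclipu clipping. */
static void idct_dc_add_8(uint8_t *dst, int16_t *block,
                          ptrdiff_t stride, int width)
{
    int dc = (block[0] + 32) >> 6;

    block[0] = 0;
    for (int i = 0; i < width; i++, dst += stride)
        for (int j = 0; j < width; j++)
            dst[j] = clip_uint8(dst[j] + dc);
}

The 16-bit entry point follows the same pattern, but clips against a
caller-provided maximum passed in a5 rather than 255 (see the
per-depth wrappers at the end of the moved block).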

Patch

diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S
index 5c70709cf2..ed6a16a9c4 100644
--- a/libavcodec/riscv/h264dsp_rvv.S
+++ b/libavcodec/riscv/h264dsp_rvv.S
@@ -1,7 +1,6 @@ 
 /*
  * SPDX-License-Identifier: BSD-2-Clause
  *
- * Copyright (c) 2024 J. Dekker <jdek@itanimul.li>
  * Copyright © 2024 Rémi Denis-Courmont.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -326,105 +325,3 @@ func ff_h264_h_loop_filter_luma_mbaff_8_rvv, zve32x
         vssseg6e8.v v8, (a0), a1
         ret
 endfunc
-
-.macro idct_dc_add8 width
-func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba
-.if \width == 8
-        vsetivli        zero, \width, e16, m1, ta, ma
-.else
-        vsetivli        zero, \width, e16, mf2, ta, ma
-.endif
-        lh              a3, 0(a1)
-        addi            a3, a3, 32
-        srai            a3, a3, 6
-        sh              zero, 0(a1)
-.if \width == 8
-        vlse64.v        v24, (a0), a2
-        vsetvli         t0, zero, e16, m8, ta, ma
-.else
-        vlse32.v        v24, (a0), a2
-        vsetvli         t0, zero, e16, m4, ta, ma
-.endif
-        vzext.vf2       v0, v24
-        vadd.vx         v0, v0, a3
-        vmax.vx         v0, v0, zero
-.if \width == 8
-        vsetvli         zero, zero, e8, m4, ta, ma
-.else
-        vsetvli         zero, zero, e8, m2, ta, ma
-.endif
-        vnclipu.wi      v24, v0, 0
-        vsetivli        zero, \width, e8, m1, ta, ma
-.if \width == 8
-        vsse64.v        v24, (a0), a2
-.else
-        vsse32.v        v24, (a0), a2
-.endif
-        ret
-endfunc
-.endm
-
-idct_dc_add8 4
-idct_dc_add8 8
-
-.macro idct_dc_add width
-func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
-        vsetivli        zero, \width, e16, m1, ta, ma
-        lw              a3, 0(a1)
-        addi            a3, a3, 32
-        srai            a3, a3, 6
-        sw              zero, 0(a1)
-        add             t4, a0, a2
-        sh1add          t5, a2, a0
-        sh1add          t6, a2, t4
-.if \width == 8
-        sh2add          t0, a2, a0
-        sh2add          t1, a2, t4
-        sh2add          t2, a2, t5
-        sh2add          t3, a2, t6
-.endif
-        vle16.v         v0, (a0)
-        vle16.v         v1, (t4)
-        vle16.v         v2, (t5)
-        vle16.v         v3, (t6)
-.if \width == 8
-        vle16.v         v4, (t0)
-        vle16.v         v5, (t1)
-        vle16.v         v6, (t2)
-        vle16.v         v7, (t3)
-        vsetvli         a6, zero, e16, m8, ta, ma
-.else
-        vsetvli         a6, zero, e16, m4, ta, ma
-.endif
-        vadd.vx         v0, v0, a3
-        vmax.vx         v0, v0, zero
-        vmin.vx         v0, v0, a5
-        vsetivli        zero, \width, e16, m1, ta, ma
-        vse16.v         v0, (a0)
-        vse16.v         v1, (t4)
-        vse16.v         v2, (t5)
-        vse16.v         v3, (t6)
-.if \width == 8
-        vse16.v         v4, (t0)
-        vse16.v         v5, (t1)
-        vse16.v         v6, (t2)
-        vse16.v         v7, (t3)
-.endif
-        ret
-endfunc
-.endm
-
-idct_dc_add 4
-idct_dc_add 8
-
-.irp depth,9,10,12,14
-func ff_h264_idct4_dc_add_\depth\()_rvv, zve64x
-        li              a5, (1 << \depth) - 1
-        j               ff_h264_idct4_dc_add_16_rvv
-endfunc
-
-func ff_h264_idct8_dc_add_\depth\()_rvv, zve64x
-        li              a5, (1 << \depth) - 1
-        j               ff_h264_idct8_dc_add_16_rvv
-endfunc
-.endr
diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S
index 505f491308..37b27fc92a 100644
--- a/libavcodec/riscv/h264idct_rvv.S
+++ b/libavcodec/riscv/h264idct_rvv.S
@@ -1,4 +1,7 @@ 
 /*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 J. Dekker <jdek@itanimul.li>
  * Copyright © 2024 Rémi Denis-Courmont.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -412,6 +415,108 @@ func ff_h264_idct8_add_\depth\()_rvv, zve32x
 endfunc
 .endr
 
+.macro idct_dc_add8 width
+func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba
+.if \width == 8
+        vsetivli        zero, \width, e16, m1, ta, ma
+.else
+        vsetivli        zero, \width, e16, mf2, ta, ma
+.endif
+        lh              a3, 0(a1)
+        addi            a3, a3, 32
+        srai            a3, a3, 6
+        sh              zero, 0(a1)
+.if \width == 8
+        vlse64.v        v24, (a0), a2
+        vsetvli         t0, zero, e16, m8, ta, ma
+.else
+        vlse32.v        v24, (a0), a2
+        vsetvli         t0, zero, e16, m4, ta, ma
+.endif
+        vzext.vf2       v0, v24
+        vadd.vx         v0, v0, a3
+        vmax.vx         v0, v0, zero
+.if \width == 8
+        vsetvli         zero, zero, e8, m4, ta, ma
+.else
+        vsetvli         zero, zero, e8, m2, ta, ma
+.endif
+        vnclipu.wi      v24, v0, 0
+        vsetivli        zero, \width, e8, m1, ta, ma
+.if \width == 8
+        vsse64.v        v24, (a0), a2
+.else
+        vsse32.v        v24, (a0), a2
+.endif
+        ret
+endfunc
+.endm
+
+idct_dc_add8 4
+idct_dc_add8 8
+
+.macro idct_dc_add width
+func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
+        vsetivli        zero, \width, e16, m1, ta, ma
+        lw              a3, 0(a1)
+        addi            a3, a3, 32
+        srai            a3, a3, 6
+        sw              zero, 0(a1)
+        add             t4, a0, a2
+        sh1add          t5, a2, a0
+        sh1add          t6, a2, t4
+.if \width == 8
+        sh2add          t0, a2, a0
+        sh2add          t1, a2, t4
+        sh2add          t2, a2, t5
+        sh2add          t3, a2, t6
+.endif
+        vle16.v         v0, (a0)
+        vle16.v         v1, (t4)
+        vle16.v         v2, (t5)
+        vle16.v         v3, (t6)
+.if \width == 8
+        vle16.v         v4, (t0)
+        vle16.v         v5, (t1)
+        vle16.v         v6, (t2)
+        vle16.v         v7, (t3)
+        vsetvli         a6, zero, e16, m8, ta, ma
+.else
+        vsetvli         a6, zero, e16, m4, ta, ma
+.endif
+        vadd.vx         v0, v0, a3
+        vmax.vx         v0, v0, zero
+        vmin.vx         v0, v0, a5
+        vsetivli        zero, \width, e16, m1, ta, ma
+        vse16.v         v0, (a0)
+        vse16.v         v1, (t4)
+        vse16.v         v2, (t5)
+        vse16.v         v3, (t6)
+.if \width == 8
+        vse16.v         v4, (t0)
+        vse16.v         v5, (t1)
+        vse16.v         v6, (t2)
+        vse16.v         v7, (t3)
+.endif
+        ret
+endfunc
+.endm
+
+idct_dc_add 4
+idct_dc_add 8
+
+.irp depth,9,10,12,14
+func ff_h264_idct4_dc_add_\depth\()_rvv, zve64x
+        li              a5, (1 << \depth) - 1
+        j               ff_h264_idct4_dc_add_16_rvv
+endfunc
+
+func ff_h264_idct8_dc_add_\depth\()_rvv, zve64x
+        li              a5, (1 << \depth) - 1
+        j               ff_h264_idct8_dc_add_16_rvv
+endfunc
+.endr
+
 const ff_h264_scan8
         .byte   014, 015, 024, 025, 016, 017, 026, 027
         .byte   034, 035, 044, 045, 036, 037, 046, 047
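
One detail worth noting in the moved code: the 9-14 bit entry points
do no work of their own. Each one merely materializes the per-depth
clipping bound (1 << depth) - 1 in a5 and tail-jumps into the shared
16-bit routine, and that cross-function jump is exactly the local
reference the commit message says requires both to live in the same
file. Roughly, in C (hypothetical names, sketching the pattern rather
than FFmpeg's actual interface):

#include <stddef.h>
#include <stdint.h>

/* Shared worker: the 16-bit sample path, parameterized by the clip
 * maximum that the assembly receives in a5. Stride is in elements
 * here for brevity; the assembly uses a byte stride. */
static void dc_add_16(uint16_t *dst, int32_t *block, ptrdiff_t stride,
                      int width, int pixel_max)
{
    int dc = (block[0] + 32) >> 6;

    block[0] = 0;
    for (int i = 0; i < width; i++, dst += stride)
        for (int j = 0; j < width; j++) {
            int v = dst[j] + dc;
            dst[j] = v < 0 ? 0 : (v > pixel_max ? pixel_max : v);
        }
}

/* Each per-depth entry point mirrors "li a5, (1 << depth) - 1"
 * followed by a tail jump to the shared routine. */
static void dc_add4_10(uint16_t *dst, int32_t *block, ptrdiff_t stride)
{
    dc_add_16(dst, block, stride, 4, (1 << 10) - 1);
}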