diff mbox series

[FFmpeg-devel,PATCHv4,1/4] lavu/riscv: add assembler macros for adjusting vector LMUL

Message ID 20240516164840.19025-1-remi@remlab.net
State Accepted
Commit ee1526c05fdfb4a96e492b5c8c2950b555ec7bab
Headers show
Series [FFmpeg-devel,PATCHv4,1/4] lavu/riscv: add assembler macros for adjusting vector LMUL | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Rémi Denis-Courmont May 16, 2024, 4:48 p.m. UTC
vtype_vli computes the VTYPE value with the optimal LMUL for a given
element width, tail and mask policies and a run-time vector length.

vtype_ivli does the same, but with the compile-time constant vector
length.

vwtypei and vntypei can be used to widen or narrow a VTYPE value for
use in mixed-width vector-optimised functions.
---
 libavutil/riscv/asm.S | 166 +++++++++++++++++++++++++++++-------------
 1 file changed, 117 insertions(+), 49 deletions(-)
diff mbox series

Patch

diff --git a/libavutil/riscv/asm.S b/libavutil/riscv/asm.S
index 14be5055f5..1e6358dcb5 100644
--- a/libavutil/riscv/asm.S
+++ b/libavutil/riscv/asm.S
@@ -96,77 +96,145 @@ 
         .endm
 #endif
 
-        /* Convenience macro to load a Vector type (vtype) as immediate */
-        .macro  lvtypei rd, e, m=m1, tp=tu, mp=mu
+#if defined (__riscv_v_elen)
+# define RV_V_ELEN __riscv_v_elen
+#else
+/* Run-time detection of the V extension implies ELEN >= 64. */
+# define RV_V_ELEN 64
+#endif
+#if RV_V_ELEN == 32
+# define VSEW_MAX 2
+#else
+# define VSEW_MAX 3
+#endif
 
-        .ifc \e,e8
-        .equ ei, 0
+        .macro  parse_vtype ew, tp, mp
+        .ifc    \ew,e8
+        .equ    vsew, 0
         .else
-        .ifc \e,e16
-        .equ ei, 8
+        .ifc    \ew,e16
+        .equ    vsew, 1
         .else
-        .ifc \e,e32
-        .equ ei, 16
+        .ifc    \ew,e32
+        .equ    vsew, 2
         .else
-        .ifc \e,e64
-        .equ ei, 24
+        .ifc    \ew,e64
+        .equ    vsew, 3
         .else
-        .error "Unknown element type"
+        .error  "Unknown element width \ew"
         .endif
         .endif
         .endif
         .endif
 
-        .ifc \m,m1
-        .equ mi, 0
-        .else
-        .ifc \m,m2
-        .equ mi, 1
-        .else
-        .ifc \m,m4
-        .equ mi, 2
+        .ifc    \tp,tu
+        .equ    tp, 0
         .else
-        .ifc \m,m8
-        .equ mi, 3
+        .ifc    \tp,ta
+        .equ    tp, 1
         .else
-        .ifc \m,mf8
-        .equ mi, 5
-        .else
-        .ifc \m,mf4
-        .equ mi, 6
-        .else
-        .ifc \m,mf2
-        .equ mi, 7
-        .else
-        .error "Unknown multiplier"
-        .equ mi, 3
-        .endif
-        .endif
-        .endif
-        .endif
-        .endif
+        .error  "Unknown tail policy \tp"
         .endif
         .endif
 
-        .ifc \tp,tu
-        .equ tpi, 0
+        .ifc    \mp,mu
+        .equ    mp, 0
         .else
-        .ifc \tp,ta
-        .equ tpi, 64
+        .ifc    \mp,ma
+        .equ    mp, 1
         .else
-        .error "Unknown tail policy"
+        .error  "Unknown mask policy \mp"
         .endif
         .endif
+        .endm
 
-        .ifc \mp,mu
-        .equ mpi, 0
-        .else
-        .ifc \mp,ma
-        .equ mpi, 128
+        /**
+         * Gets the vector type with the smallest suitable LMUL value.
+         * @param[out] rd vector type destination register
+         * @param vl vector length constant
+         * @param ew element width: e8, e16, e32 or e64
+         * @param tp tail policy: tu or ta
+         * @param mp mask policty: mu or ma
+         */
+        .macro  vtype_ivli rd, avl, ew, tp=tu, mp=mu
+        .if     \avl <= 1
+        .equ    log2vl, 0
+        .elseif \avl <= 2
+        .equ    log2vl, 1
+        .elseif \avl <= 4
+        .equ    log2vl, 2
+        .elseif \avl <= 8
+        .equ    log2vl, 3
+        .elseif \avl <= 16
+        .equ    log2vl, 4
+        .elseif \avl <= 32
+        .equ    log2vl, 5
+        .elseif \avl <= 64
+        .equ    log2vl, 6
+        .elseif \avl <= 128
+        .equ    log2vl, 7
         .else
-        .error "Unknown mask policy"
+        .error  "Vector length \avl out of range"
         .endif
+        parse_vtype \ew, \tp, \mp
+        csrr    \rd, vlenb
+        clz     \rd, \rd
+        addi    \rd, \rd, log2vl + 1 + VSEW_MAX - __riscv_xlen
+        max     \rd, \rd, zero // VLMUL must be >= VSEW - VSEW_MAX
+        .if     vsew < VSEW_MAX
+        addi    \rd, \rd, vsew - VSEW_MAX
+        andi    \rd, \rd, 7
         .endif
+        ori     \rd, \rd, (vsew << 3) | (tp << 6) | (mp << 7)
+        .endm
+
+        /**
+         * Gets the vector type with the smallest suitable LMUL value.
+         * @param[out] rd vector type destination register
+         * @param rs vector length source register
+         * @param[out] tmp temporary register to be clobbered
+         * @param ew element width: e8, e16, e32 or e64
+         * @param tp tail policy: tu or ta
+         * @param mp mask policty: mu or ma
+         */
+        .macro  vtype_vli rd, rs, tmp, ew, tp=tu, mp=mu
+        parse_vtype \ew, \tp, \mp
+        /*
+         * The difference between the CLZ's notionally equals the VLMUL value
+         * for 4-bit elements. But we want the value for SEW_MAX-bit elements.
+         */
+        slli    \tmp, \rs, 1 + VSEW_MAX
+        csrr    \rd, vlenb
+        addi    \tmp, \tmp, -1
+        clz     \rd, \rd
+        clz     \tmp, \tmp
+        sub     \rd, \rd, \tmp
+        max     \rd, \rd, zero // VLMUL must be >= VSEW - VSEW_MAX
+        .if     vsew < VSEW_MAX
+        addi    \rd, \rd, vsew - VSEW_MAX
+        andi    \rd, \rd, 7
+        .endif
+        ori     \rd, \rd, (vsew << 3) | (tp << 6) | (mp << 7)
+        .endm
+
+        /**
+         * Widens a vector type.
+         * @param[out] rd widened vector type destination register
+         * @param rs vector type source register
+         * @param n number of times to widen (once by default)
+         */
+        .macro  vwtypei rd, rs, n=1
+        xori    \rd, \rs, 4
+        addi    \rd, \rd, (\n) * 011
+        xori    \rd, \rd, 4
+        .endm
 
-        li      \rd, (ei | mi | tpi | mpi)
+        /**
+         * Narrows a vector type.
+         * @param[out] rd narrowed vector type destination register
+         * @param rs vector type source register
+         * @param n number of times to narrow (once by default)
+         */
+        .macro  vntypei rd, rs, n=1
+        vwtypei \rd, \rs, -(\n)
         .endm