diff mbox series

[FFmpeg-devel] lavc/h264dsp: R-V V high-depth idct_add{, intra}16, idct8_add4

Message ID 20240715191121.14217-1-remi@remlab.net
State New
Headers show
Series [FFmpeg-devel] lavc/h264dsp: R-V V high-depth idct_add{, intra}16, idct8_add4 | expand

Checks

Context Check Description
andriy/configure_x86 warning Failed to apply patch

Commit Message

Rémi Denis-Courmont July 15, 2024, 7:11 p.m. UTC
As with 8-bit, this tends to be faster, but results are all over the
place due to the variable distribution of non-zero coefficients.
---
 libavcodec/riscv/h264dsp_init.c |  77 +++++++++--------
 libavcodec/riscv/h264idct_rvv.S | 147 +++++++++++++++++++++++++-------
 2 files changed, 154 insertions(+), 70 deletions(-)
diff mbox series

Patch

diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c
index 4fc695f158..14eea29892 100644
--- a/libavcodec/riscv/h264dsp_init.c
+++ b/libavcodec/riscv/h264dsp_init.c
@@ -40,26 +40,25 @@  void ff_h264_h_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
 void ff_h264_h_loop_filter_luma_mbaff_8_rvv(uint8_t *pix, ptrdiff_t stride,
                                             int alpha, int beta, int8_t *tc0);
 
-void ff_h264_idct_add_8_rvv(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct8_add_8_rvv(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct_add16_8_rvv(uint8_t *dst, const int *blockoffset,
-                              int16_t *block, int stride,
-                              const uint8_t nnzc[5 * 8]);
-void ff_h264_idct_add16intra_8_rvv(uint8_t *dst, const int *blockoffset,
-                                   int16_t *block, int stride,
-                                   const uint8_t nnzc[5 * 8]);
-void ff_h264_idct8_add4_8_rvv(uint8_t *dst, const int *blockoffset,
-                              int16_t *block, int stride,
-                              const uint8_t nnzc[5 * 8]);
-
-void ff_h264_idct_add_9_rvv(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct8_add_9_rvv(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct_add_10_rvv(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct8_add_10_rvv(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct_add_12_rvv(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct8_add_12_rvv(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct_add_14_rvv(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct8_add_14_rvv(uint8_t *dst, int16_t *block, int stride);
+#define IDCT_DEPTH(depth) \
+void ff_h264_idct_add_##depth##_rvv(uint8_t *d, int16_t *s, int stride); \
+void ff_h264_idct8_add_##depth##_rvv(uint8_t *d, int16_t *s, int stride); \
+void ff_h264_idct_add16_##depth##_rvv(uint8_t *d, const int *soffset, \
+                                      int16_t *s, int stride, \
+                                      const uint8_t nnzc[5 * 8]); \
+void ff_h264_idct_add16intra_##depth##_rvv(uint8_t *d, const int *soffset, \
+                                   int16_t *s, int stride, \
+                                   const uint8_t nnzc[5 * 8]); \
+void ff_h264_idct8_add4_##depth##_rvv(uint8_t *d, const int *soffset, \
+                                      int16_t *s, int stride, \
+                                      const uint8_t nnzc[5 * 8]);
+/* IDCT_DEPTH(n) declares the five RVV IDCT entry points for bit depth n. */
+IDCT_DEPTH(8)
+IDCT_DEPTH(9)
+IDCT_DEPTH(10)
+IDCT_DEPTH(12)
+IDCT_DEPTH(14)
+#undef IDCT_DEPTH /* the name is defined again, with a different body, in the init function */
 
 void ff_h264_add_pixels8_8_rvv(uint8_t *dst, int16_t *block, int stride);
 void ff_h264_add_pixels4_8_rvv(uint8_t *dst, int16_t *block, int stride);
@@ -106,26 +105,26 @@  av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
             dsp->h264_add_pixels4_clear = ff_h264_add_pixels4_8_rvv;
         }
 
-        if (bit_depth == 9) {
-            if (zvl128b)
-                dsp->h264_idct_add = ff_h264_idct_add_9_rvv;
-            dsp->h264_idct8_add = ff_h264_idct8_add_9_rvv;
-        }
-        if (bit_depth == 10) {
-            if (zvl128b)
-                dsp->h264_idct_add = ff_h264_idct_add_10_rvv;
-            dsp->h264_idct8_add = ff_h264_idct8_add_10_rvv;
-        }
-        if (bit_depth == 12) {
-            if (zvl128b)
-                dsp->h264_idct_add = ff_h264_idct_add_12_rvv;
-            dsp->h264_idct8_add = ff_h264_idct8_add_12_rvv;
-        }
-        if (bit_depth == 14) {
-            if (zvl128b)
-                dsp->h264_idct_add = ff_h264_idct_add_14_rvv;
-            dsp->h264_idct8_add = ff_h264_idct8_add_14_rvv;
+#define IDCT_DEPTH(depth) \
+        if (bit_depth == depth) { \
+            if (zvl128b) \
+                dsp->h264_idct_add = ff_h264_idct_add_##depth##_rvv; \
+            if (flags & AV_CPU_FLAG_RVB_ADDR) /* NOTE(review): the replaced code had no flag check here; presumably needed for sh1add/sh2add in the idct8 code - confirm */ \
+                dsp->h264_idct8_add = ff_h264_idct8_add_##depth##_rvv; \
+            if (__riscv_xlen == 64 && zvl128b) { /* add16 variants are only assembled when __riscv_xlen == 64 */ \
+                dsp->h264_idct_add16 = ff_h264_idct_add16_##depth##_rvv; \
+                dsp->h264_idct_add16intra = \
+                    ff_h264_idct_add16intra_##depth##_rvv; \
+            } \
+            if (__riscv_xlen == 64 && (flags & AV_CPU_FLAG_RVB_ADDR)) \
+                dsp->h264_idct8_add4 = ff_h264_idct8_add4_##depth##_rvv; \
         }
+        /* Install the IDCT functions of whichever high bit depth equals bit_depth. */
+        IDCT_DEPTH(9)
+        IDCT_DEPTH(10)
+        IDCT_DEPTH(12)
+        IDCT_DEPTH(14)
+
         if (bit_depth > 8 && zvl128b) {
             dsp->h264_add_pixels8_clear = ff_h264_add_pixels8_16_rvv;
             if (flags & AV_CPU_FLAG_RVV_I64)
diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S
index 7dd0a524fe..48de65ec0b 100644
--- a/libavcodec/riscv/h264idct_rvv.S
+++ b/libavcodec/riscv/h264idct_rvv.S
@@ -107,6 +107,7 @@  endfunc
 
 func ff_h264_idct_add_16_rvv, zve32x
         csrwi       vxrm, 0
+.Lidct_add4_16_rvv:
         vsetivli    zero, 4, e32, m1, ta, ma
         addi        t1, a1, 1 * 4 * 4
         vle32.v     v0, (a1)
@@ -147,7 +148,7 @@  func ff_h264_idct_add_16_rvv, zve32x
         vmax.vx     v\n, v\n, zero
         .endr
         .irp    n,0,1,2,3
-        vmin.vx     v\n, v\n, a3
+        vmin.vx     v\n, v\n, a5
         .endr
         vsetvli     zero, zero, e16, mf2, ta, ma
         vncvt.x.x.w v4, v0
@@ -295,9 +296,10 @@  func ff_h264_idct8_add_8_rvv, zve32x
 endfunc
 
 func ff_h264_idct8_add_16_rvv, zve32x
-        li      a4, 8
         csrwi   vxrm, 0
-        vsetivli    a5, 8, e32, m1, ta, ma
+.Lidct8_add_16_rvv:
+        li      a4, 8
+        vsetivli    a3, 8, e32, m1, ta, ma
 1:
         addi    t1, a1, 1 * 8 * 4
         vle32.v     v0, (a1)
@@ -313,11 +315,11 @@  func ff_h264_idct8_add_16_rvv, zve32x
         vle32.v     v5, (t5)
         addi    a7, a1, 7 * 8 * 4
         vle32.v     v6, (t6)
-        sub     a4, a4, a5
+        sub     a4, a4, a3
         vle32.v     v7, (a7)
         jal     t0, ff_h264_idct8_rvv
         vse32.v     v0, (a1)
-        sh2add  a1, a5, a1
+        sh2add  a1, a3, a1
         vse32.v     v1, (t1)
         vse32.v     v2, (t2)
         vse32.v     v3, (t3)
@@ -329,7 +331,7 @@  func ff_h264_idct8_add_16_rvv, zve32x
 
         addi    a1, a1, -8 * 4
         li      a4, 8
-        slli    a6, a5, 3 + 2
+        slli    a6, a3, 3 + 2
 2:
         vsetvli     zero, zero, e32, m1, ta, ma
         vlseg8e32.v v0, (a1)
@@ -348,7 +350,7 @@  func ff_h264_idct8_add_16_rvv, zve32x
         vle16.v     v21, (t5)
         add     a7, t6, a2
         vle16.v     v22, (t6)
-        sub     a4, a4, a5
+        sub     a4, a4, a3
         vle16.v     v23, (a7)
         .irp    n,0,1,2,3,4,5,6,7
         vssra.vi    v\n, v\n, 6
@@ -368,7 +370,7 @@  func ff_h264_idct8_add_16_rvv, zve32x
         vmax.vx     v\n, v\n, zero
         .endr
         .irp    n,0,1,2,3,4,5,6,7
-        vmin.vx     v\n, v\n, a3
+        vmin.vx     v\n, v\n, a5
         .endr
         vsetvli     zero, zero, e16, mf2, ta, ma
         vncvt.x.x.w v16, v0
@@ -380,7 +382,7 @@  func ff_h264_idct8_add_16_rvv, zve32x
         vncvt.x.x.w v22, v6
         vncvt.x.x.w v23, v7
         vse16.v     v16, (a0)
-        sh1add  a0, a5, a0
+        sh1add  a0, a3, a0
         vse16.v     v17, (t1)
         vse16.v     v18, (t2)
         vse16.v     v19, (t3)
@@ -400,12 +402,12 @@  endfunc
 
 .irp    depth, 9, 10, 12, 14
 func ff_h264_idct_add_\depth\()_rvv, zve32x
-        li      a3, (1 << \depth) - 1
+        li      a5, (1 << \depth) - 1    # upper clamp bound; the shared 16-bit code now reads it from a5 (vmin.vx), no longer a3
         j       ff_h264_idct_add_16_rvv
 endfunc
 
 func ff_h264_idct8_add_\depth\()_rvv, zve32x
-        li      a3, (1 << \depth) - 1
+        li      a5, (1 << \depth) - 1    # upper clamp bound in a5; a3 now receives the vsetivli result in the 16-bit idct8 code
         j       ff_h264_idct8_add_16_rvv
 endfunc
 .endr
@@ -416,13 +418,13 @@  const ff_h264_scan8
 endconst
 
 #if (__riscv_xlen == 64)
-.irp    depth, 8
+.irp    depth, 8, 16
 func ff_h264_idct_add16_\depth\()_rvv, zve32x
         csrwi   vxrm, 0
-        addi    sp, sp, -80
+        addi    sp, sp, -96
         lla     t0, ff_h264_scan8
         sd      s0,   (sp)
-        li      t1, 32 << (\depth > 8)
+        li      t1, 32 * (\depth / 8)
         mv      s0, sp
         sd      ra,  8(sp)
         sd      s1, 16(sp)
@@ -432,9 +434,19 @@  func ff_h264_idct_add16_\depth\()_rvv, zve32x
         sd      s5, 48(sp)
         sd      s6, 56(sp)
         sd      s7, 64(sp)
+.if \depth > 8
+        sd      s8, 72(sp)
+        sd      s9, 80(sp)
+        mv      s8, a5
+        mv      s9, a6
+.endif
         vsetivli  zero, 16, e8, m1, ta, ma
         vle8.v    v8, (t0)
+.if \depth == 8
         vlse16.v  v16, (a2), t1
+.else
+        vlse32.v  v16, (a2), t1
+.endif
         vluxei8.v v12, (a4), v8
 .if \depth == 8
         vsetvli   zero, zero, e16, m2, ta, ma
@@ -464,17 +476,28 @@  func ff_h264_idct_add16_\depth\()_rvv, zve32x
         mv      a1, s6
         mv      a2, s7
         add     a0, s4, t2
-        beqz    t1, 2f    # if (nnz == 1 && block[i * 16])
-        call    ff_h264_idct_dc_add_\depth\()_c
+.if \depth > 8
+        mv      a5, s8
+.endif
+        bnez    t1, 2f    # if (nnz == 1 && block[i * 16])
+        jal     .Lidct_add4_\depth\()_rvv
         j       3f
 2:
-        call    .Lidct_add4_\depth\()_rvv
+.if \depth == 8
+        call    ff_h264_idct_dc_add_\depth\()_c
+.else
+        jalr    s9
+.endif
 3:
         srli    s3, s3, 1
         addi    s5, s5, 4
-        addi    s6, s6, 16 * 2 << (\depth > 8)
+        addi    s6, s6, 16 * 2 * (\depth / 8)
         bnez    s1, 1b
 
+.if \depth > 8
+        ld      s9, 80(sp)
+        ld      s8, 72(sp)
+.endif
         ld      s7, 64(sp)
         ld      s6, 56(sp)
         ld      s5, 48(sp)
@@ -484,16 +507,16 @@  func ff_h264_idct_add16_\depth\()_rvv, zve32x
         ld      s1, 16(sp)
         ld      ra,  8(sp)
         ld      s0,  0(sp)
-        addi    sp, sp, 80
+        addi    sp, sp, 96
         ret
 endfunc
 
 func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
         csrwi   vxrm, 0
-        addi    sp, sp, -80
+        addi    sp, sp, -96
         lla     t0, ff_h264_scan8
         sd      s0,   (sp)
-        li      t1, 32 << (\depth > 8)
+        li      t1, 32 * (\depth / 8)
         mv      s0, sp
         sd      ra,  8(sp)
         sd      s1, 16(sp)
@@ -503,9 +526,19 @@  func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
         sd      s5, 48(sp)
         sd      s6, 56(sp)
         sd      s7, 64(sp)
+.if \depth > 8
+        sd      s8, 72(sp)
+        sd      s9, 80(sp)
+        mv      s8, a5
+        mv      s9, a6
+.endif
         vsetivli  zero, 16, e8, m1, ta, ma
         vle8.v    v8, (t0)
+.if \depth == 8
         vlse16.v  v16, (a2), t1
+.else
+        vlse32.v  v16, (a2), t1
+.endif
         vluxei8.v v12, (a4), v8
 .if \depth == 8
         vsetvli   zero, zero, e16, m2, ta, ma
@@ -532,18 +565,29 @@  func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
         mv      a1, s6
         mv      a2, s7
         add     a0, s4, t2
+.if \depth > 8
+        mv      a5, s8
+.endif
         beqz    t0, 2f     # if (nnzc[scan8[i]])
-        call    .Lidct_add4_\depth\()_rvv
+        jal     .Lidct_add4_\depth\()_rvv
         j       3f
 2:
         beqz    t1, 3f    # if (block[i * 16])
+.if \depth == 8
         call    ff_h264_idct_dc_add_\depth\()_c
+.else
+        jalr    s9
+.endif
 3:
         srli    s3, s3, 1
         addi    s5, s5, 4
-        addi    s6, s6, 16 * 2 << (\depth > 8)
+        addi    s6, s6, 16 * 2 * (\depth / 8)
         bnez    s1, 1b
 
+.if \depth > 8
+        ld      s9, 80(sp)
+        ld      s8, 72(sp)
+.endif
         ld      s7, 64(sp)
         ld      s6, 56(sp)
         ld      s5, 48(sp)
@@ -553,16 +597,16 @@  func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
         ld      s1, 16(sp)
         ld      ra,  8(sp)
         ld      s0,  0(sp)
-        addi    sp, sp, 80
+        addi    sp, sp, 96
         ret
 endfunc
 
 func ff_h264_idct8_add4_\depth\()_rvv, zve32x
         csrwi       vxrm, 0
-        addi    sp, sp, -80
+        addi    sp, sp, -96
         lla     t0, ff_h264_scan8
         sd      s0,   (sp)
-        li      t1, 4 * 32 << (\depth > 8)
+        li      t1, 4 * 32 * (\depth / 8)
         mv      s0, sp
         li      t2, 4
         sd      ra,  8(sp)
@@ -573,9 +617,19 @@  func ff_h264_idct8_add4_\depth\()_rvv, zve32x
         sd      s5, 48(sp)
         sd      s6, 56(sp)
         sd      s7, 64(sp)
+.if \depth > 8
+        sd      s8, 72(sp)
+        sd      s9, 80(sp)
+        mv      s8, a5
+        mv      s9, a6
+.endif
         vsetivli  zero, 4, e8, mf4, ta, ma
         vlse8.v   v8, (t0), t2
+.if \depth == 8
         vlse16.v  v16, (a2), t1
+.else
+        vlse32.v  v16, (a2), t1
+.endif
         vluxei8.v v12, (a4), v8
 .if \depth == 8
         vsetvli   zero, zero, e16, mf2, ta, ma
@@ -604,17 +658,28 @@  func ff_h264_idct8_add4_\depth\()_rvv, zve32x
         mv      a1, s6
         mv      a2, s7
         add     a0, s4, t2
-        beqz    t1, 2f    # if (nnz == 1 && block[i * 16])
-        call    ff_h264_idct8_dc_add_\depth\()_c
+.if \depth > 8
+        mv      a5, s8
+.endif
+        bnez    t1, 2f    # if (nnz == 1 && block[i * 16])
+        jal     .Lidct8_add_\depth\()_rvv
         j       3f
 2:
-        call    .Lidct8_add_\depth\()_rvv
+.if \depth == 8
+        call    ff_h264_idct8_dc_add_\depth\()_c
+.else
+        jalr    s9
+.endif
 3:
         srli    s3, s3, 1
         addi    s5, s5, 4 * 4
-        addi    s6, s6, 4 * 16 * 2 << (\depth > 8)
+        addi    s6, s6, 4 * 16 * 2 * (\depth / 8)
         bnez    s1, 1b
 
+.if \depth > 8
+        ld      s9, 80(sp)
+        ld      s8, 72(sp)
+.endif
         ld      s7, 64(sp)
         ld      s6, 56(sp)
         ld      s5, 48(sp)
@@ -624,8 +689,28 @@  func ff_h264_idct8_add4_\depth\()_rvv, zve32x
         ld      s1, 16(sp)
         ld      ra,  8(sp)
         ld      s0,  0(sp)
-        addi    sp, sp, 80
+        addi    sp, sp, 96
         ret
 endfunc
 .endr
+
+.irp    depth, 9, 10, 12, 14
+func ff_h264_idct_add16_\depth\()_rvv, zve32x
+        li      a5, (1 << \depth) - 1    # upper clamp bound; the 16-bit body keeps it in s8 and reloads a5 before each block
+        lla     a6, ff_h264_idct_dc_add_\depth\()_c    # C DC-only routine; the 16-bit body keeps it in s9 and calls it with jalr
+        j       ff_h264_idct_add16_16_rvv    # tail jump: ra is untouched, so the shared body returns to our caller
+endfunc
+
+func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
+        li      a5, (1 << \depth) - 1    # upper clamp bound, same a5/s8 convention as above
+        lla     a6, ff_h264_idct_dc_add_\depth\()_c    # DC-only routine, same a6/s9 convention as above
+        j       ff_h264_idct_add16intra_16_rvv
+endfunc
+
+func ff_h264_idct8_add4_\depth\()_rvv, zve32x
+        li      a5, (1 << \depth) - 1    # upper clamp bound, same a5/s8 convention as above
+        lla     a6, ff_h264_idct8_dc_add_\depth\()_c    # 8x8 DC-only routine, passed in a6 and kept in s9
+        j       ff_h264_idct8_add4_16_rvv
+endfunc
+.endr
 #endif